diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index a656fe341c8e0..7a14c6ec21a1a 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -4117,15 +4117,15 @@ let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVecto def selectpd_512 : X86Builtin<"_Vector<8, double>(unsigned char, _Vector<8, double>, _Vector<8, double>)">; } -let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512fp16", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectsh_128 : X86Builtin<"_Vector<8, _Float16>(unsigned char, _Vector<8, _Float16>, _Vector<8, _Float16>)">; } -let Features = "avx512bf16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512bf16", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectsbf_128 : X86Builtin<"_Vector<8, __bf16>(unsigned char, _Vector<8, __bf16>, _Vector<8, __bf16>)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectss_128 : X86Builtin<"_Vector<4, float>(unsigned char, _Vector<4, float>, _Vector<4, float>)">; def selectsd_128 : X86Builtin<"_Vector<2, double>(unsigned char, _Vector<2, double>, _Vector<2, double>)">; } diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 71c1ab7ef003f..15212bf7c5dbd 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -129,6 +129,9 @@ def err_drv_bad_offload_arch_combo : Error< "invalid offload arch combinations: '%0' and '%1' (for a specific processor, " "a feature should either exist in all offload archs, or not exist in any " "offload archs)">; +def err_drv_unsupported_option_for_offload_arch_req_feature : Error< + "'%0' option for offload arch '%1' is not currently supported " + "there. Use it with an offload arch containing '%2' instead">; def warn_drv_unsupported_option_for_offload_arch_req_feature : Warning< "ignoring '%0' option for offload arch '%1' as it is not currently supported " "there. 
Use it with an offload arch containing '%2' instead">, @@ -136,6 +139,13 @@ def warn_drv_unsupported_option_for_offload_arch_req_feature : Warning< def warn_drv_unsupported_option_for_target : Warning< "ignoring '%0' option as it is not currently supported for target '%1'">, InGroup; +def err_drv_unsupported_option_for_target : Error< + "'%0' option is not currently supported for target '%1'">; +def warn_drv_unsupported_option_part_for_target : Warning< + "ignoring '%0' in '%1' option as it is not currently supported for target '%2'">, + InGroup; +def err_drv_unsupported_option_part_for_target : Error< + "'%0' in '%1' option is not currently supported for target '%2'">; def warn_drv_invalid_argument_for_flang : Warning< "'%0' is not valid for Fortran">, InGroup; diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index 737ceac80635b..be9965ae3101f 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -131,6 +131,14 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return cir::IntType::get(getContext(), n, false); } + static unsigned getCIRIntOrFloatBitWidth(mlir::Type eltTy) { + if (auto intType = mlir::dyn_cast(eltTy)) + return intType.getWidth(); + if (auto floatType = mlir::dyn_cast(eltTy)) + return floatType.getWidth(); + + llvm_unreachable("Unsupported type in getCIRIntOrFloatBitWidth"); + } cir::IntType getSIntNTy(int n) { return cir::IntType::get(getContext(), n, true); } @@ -565,6 +573,16 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return cir::CmpOp::create(*this, loc, getBoolTy(), kind, lhs, rhs); } + cir::VecCmpOp createVecCompare(mlir::Location loc, cir::CmpOpKind kind, + mlir::Value lhs, mlir::Value rhs) { + VectorType vecCast = mlir::cast(lhs.getType()); + IntType integralTy = + getSIntNTy(getCIRIntOrFloatBitWidth(vecCast.getElementType())); + VectorType integralVecTy = + VectorType::get(context, integralTy, vecCast.getSize()); + return cir::VecCmpOp::create(*this, loc, integralVecTy, kind, lhs, rhs); + } + mlir::Value createIsNaN(mlir::Location loc, mlir::Value operand) { return createCompare(loc, cir::CmpOpKind::ne, operand, operand); } diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index d93ee2675b366..34c2476ffccce 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -259,6 +259,7 @@ struct MissingFeatures { static bool emitBranchThroughCleanup() { return false; } static bool emitCheckedInBoundsGEP() { return false; } static bool emitCondLikelihoodViaExpectIntrinsic() { return false; } + static bool emitConstrainedFPCall() { return false; } static bool emitLifetimeMarkers() { return false; } static bool emitLValueAlignmentAssumption() { return false; } static bool emitNullCheckForDeleteCalls() { return false; } diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 08b452888b161..d346ad1efb588 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -201,6 +201,10 @@ def hlsl_Group : OptionGroup<"">, Group, DocName<"HLSL options">, Visibility<[ClangOption]>; +def fsan_cov_Group : OptionGroup<"<-fsanitize-coverage group>">, + Group, + DocName<"Sanitizer Coverage options">; + // Feature groups - these take command line options that correspond directly to // target specific features and can be translated directly from 
command line // options. @@ -2413,26 +2417,26 @@ def : Flag<["-"], "fno-sanitize-blacklist">, Group, Flags<[HelpHidden]>, Alias; def fsanitize_coverage : CommaJoined<["-"], "fsanitize-coverage=">, - Group, + Group, HelpText<"Specify the type of coverage instrumentation for Sanitizers">; def fno_sanitize_coverage : CommaJoined<["-"], "fno-sanitize-coverage=">, - Group, Visibility<[ClangOption, CLOption]>, + Group, Visibility<[ClangOption, CLOption]>, HelpText<"Disable features of coverage instrumentation for Sanitizers">, Values<"func,bb,edge,indirect-calls,trace-bb,trace-cmp,trace-div,trace-gep," "8bit-counters,trace-pc,trace-pc-guard,no-prune,inline-8bit-counters," "inline-bool-flag">; def fsanitize_coverage_allowlist : Joined<["-"], "fsanitize-coverage-allowlist=">, - Group, Visibility<[ClangOption, CLOption]>, + Group, Visibility<[ClangOption, CLOption]>, HelpText<"Restrict sanitizer coverage instrumentation exclusively to modules and functions that match the provided special case list, except the blocked ones">, MarshallingInfoStringVector>; def fsanitize_coverage_ignorelist : Joined<["-"], "fsanitize-coverage-ignorelist=">, - Group, Visibility<[ClangOption, CLOption]>, + Group, Visibility<[ClangOption, CLOption]>, HelpText<"Disable sanitizer coverage instrumentation for modules and functions " "that match the provided special case list, even the allowed ones">, MarshallingInfoStringVector>; def fsanitize_coverage_stack_depth_callback_min_EQ : Joined<["-"], "fsanitize-coverage-stack-depth-callback-min=">, - Group, + Group, MetaVarName<"">, HelpText<"Use callback for max stack depth tracing with minimum stack " "depth M">, @@ -8068,70 +8072,87 @@ def linker_option : Joined<["--"], "linker-option=">, HelpText<"Add linker option">, MarshallingInfoStringVector>; def fsanitize_coverage_type : Joined<["-"], "fsanitize-coverage-type=">, + Group, HelpText<"Sanitizer coverage type">, MarshallingInfoInt>; def fsanitize_coverage_indirect_calls : Flag<["-"], "fsanitize-coverage-indirect-calls">, + Group, HelpText<"Enable sanitizer coverage for indirect calls">, MarshallingInfoFlag>; def fsanitize_coverage_trace_bb : Flag<["-"], "fsanitize-coverage-trace-bb">, + Group, HelpText<"Enable basic block tracing in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_trace_cmp : Flag<["-"], "fsanitize-coverage-trace-cmp">, + Group, HelpText<"Enable cmp instruction tracing in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_trace_div : Flag<["-"], "fsanitize-coverage-trace-div">, + Group, HelpText<"Enable div instruction tracing in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_trace_gep : Flag<["-"], "fsanitize-coverage-trace-gep">, + Group, HelpText<"Enable gep instruction tracing in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_8bit_counters : Flag<["-"], "fsanitize-coverage-8bit-counters">, + Group, HelpText<"Enable frequency counters in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_inline_8bit_counters : Flag<["-"], "fsanitize-coverage-inline-8bit-counters">, + Group, HelpText<"Enable inline 8-bit counters in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_inline_bool_flag : Flag<["-"], "fsanitize-coverage-inline-bool-flag">, + Group, HelpText<"Enable inline bool flag in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_pc_table : Flag<["-"], "fsanitize-coverage-pc-table">, + Group, HelpText<"Create a table of coverage-instrumented PCs">, MarshallingInfoFlag>; def 
fsanitize_coverage_control_flow : Flag<["-"], "fsanitize-coverage-control-flow">, + Group, HelpText<"Collect control flow of function">, MarshallingInfoFlag>; def fsanitize_coverage_trace_pc : Flag<["-"], "fsanitize-coverage-trace-pc">, + Group, HelpText<"Enable PC tracing in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_trace_pc_guard : Flag<["-"], "fsanitize-coverage-trace-pc-guard">, + Group, HelpText<"Enable PC tracing with guard in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_no_prune : Flag<["-"], "fsanitize-coverage-no-prune">, + Group, HelpText<"Disable coverage pruning (i.e. instrument all blocks/edges)">, MarshallingInfoFlag>; def fsanitize_coverage_stack_depth : Flag<["-"], "fsanitize-coverage-stack-depth">, + Group, HelpText<"Enable max stack depth tracing">, MarshallingInfoFlag>; def fsanitize_coverage_trace_loads : Flag<["-"], "fsanitize-coverage-trace-loads">, + Group, HelpText<"Enable tracing of loads">, MarshallingInfoFlag>; def fsanitize_coverage_trace_stores : Flag<["-"], "fsanitize-coverage-trace-stores">, + Group, HelpText<"Enable tracing of stores">, MarshallingInfoFlag>; def fexperimental_sanitize_metadata_EQ_covered diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index f8bbfed8bb387..8779ffab13b86 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6057,6 +6057,7 @@ bool Compiler::visitSwitchStmt(const SwitchStmt *S) { DefaultLabel); if (!this->visitStmt(S->getBody())) return false; + this->fallthrough(EndLabel); this->emitLabel(EndLabel); return LS.destroyLocals(); @@ -6064,6 +6065,7 @@ bool Compiler::visitSwitchStmt(const SwitchStmt *S) { template bool Compiler::visitCaseStmt(const CaseStmt *S) { + this->fallthrough(CaseLabels[S]); this->emitLabel(CaseLabels[S]); return this->visitStmt(S->getSubStmt()); } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 30426565407ba..5a96320e12b6f 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2838,6 +2838,30 @@ static bool interp__builtin_select(InterpState &S, CodePtr OpPC, return true; } +/// Scalar variant of AVX512 predicated select: +/// Result[i] = (Mask bit 0) ? LHS[i] : RHS[i], but only element 0 may change. +/// All other elements are taken from RHS. 
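Illustrative aside, not part of the patch: once these scalar selects are marked Constexpr and handled by both constant evaluators below, a masked scalar builtin can fold at compile time. A minimal sketch of the semantics described in the comment above, assuming a translation unit built with -mavx512f; the vector values and names here are made up for illustration:

```c++
// Sketch only: bit 0 of the mask chooses element 0 from A; all remaining
// elements always come from W (the pass-through operand).
#include <immintrin.h>

constexpr __m128 a = {1.0f, 2.0f, 3.0f, 4.0f};
constexpr __m128 w = {9.0f, 8.0f, 7.0f, 6.0f};

// Mask bit 0 set: element 0 is taken from a, lanes 1..3 from w -> {1, 8, 7, 6}.
constexpr __m128 r1 = __builtin_ia32_selectss_128(0x1, a, w);
// Mask bit 0 clear: the whole result is w -> {9, 8, 7, 6}.
constexpr __m128 r0 = __builtin_ia32_selectss_128(0x0, a, w);

static_assert(r1[0] == 1.0f && r1[1] == 8.0f && r1[2] == 7.0f && r1[3] == 6.0f);
static_assert(r0[0] == 9.0f);
```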
+static bool interp__builtin_select_scalar(InterpState &S, + const CallExpr *Call) { + unsigned N = + Call->getArg(1)->getType()->getAs()->getNumElements(); + + const Pointer &W = S.Stk.pop(); + const Pointer &A = S.Stk.pop(); + APSInt U = popToAPSInt(S, Call->getArg(0)); + const Pointer &Dst = S.Stk.peek(); + + bool TakeA0 = U.getZExtValue() & 1ULL; + + for (unsigned I = TakeA0; I != N; ++I) + Dst.elem(I) = W.elem(I); + if (TakeA0) + Dst.elem(0) = A.elem(0); + + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_blend(InterpState &S, CodePtr OpPC, const CallExpr *Call) { APSInt Mask = popToAPSInt(S, Call->getArg(2)); @@ -4151,6 +4175,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return APInt::getAllOnes(DstBits); }); + case clang::X86::BI__builtin_ia32_selectss_128: + case clang::X86::BI__builtin_ia32_selectsd_128: + case clang::X86::BI__builtin_ia32_selectsh_128: + case clang::X86::BI__builtin_ia32_selectsbf_128: + return interp__builtin_select_scalar(S, Call); case clang::X86::BI__builtin_ia32_vprotbi: case clang::X86::BI__builtin_ia32_vprotdi: case clang::X86::BI__builtin_ia32_vprotqi: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index ed1f1b7508ffc..74f6e3acb6b39 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12202,6 +12202,24 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), SourceLen), E); }; + auto EvalSelectScalar = [&](unsigned Len) -> bool { + APSInt Mask; + APValue AVal, WVal; + if (!EvaluateInteger(E->getArg(0), Mask, Info) || + !EvaluateAsRValue(Info, E->getArg(1), AVal) || + !EvaluateAsRValue(Info, E->getArg(2), WVal)) + return false; + + bool TakeA0 = (Mask.getZExtValue() & 1u) != 0; + SmallVector Res; + Res.reserve(Len); + Res.push_back(TakeA0 ? AVal.getVectorElt(0) : WVal.getVectorElt(0)); + for (unsigned I = 1; I < Len; ++I) + Res.push_back(WVal.getVectorElt(I)); + APValue V(Res.data(), Res.size()); + return Success(V, E); + }; + switch (E->getBuiltinCallee()) { default: return false; @@ -12505,6 +12523,13 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return APInt((Src).trunc(DstBits)); return APInt::getAllOnes(DstBits); }); + case clang::X86::BI__builtin_ia32_selectss_128: + return EvalSelectScalar(4); + case clang::X86::BI__builtin_ia32_selectsd_128: + return EvalSelectScalar(2); + case clang::X86::BI__builtin_ia32_selectsh_128: + case clang::X86::BI__builtin_ia32_selectsbf_128: + return EvalSelectScalar(8); case clang::X86::BI__builtin_ia32_pmuldq128: case clang::X86::BI__builtin_ia32_pmuldq256: case clang::X86::BI__builtin_ia32_pmuldq512: diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index ba160373ec77e..ee6900141647f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -33,18 +33,53 @@ static mlir::Value emitIntrinsicCallOp(CIRGenFunction &cgf, const CallExpr *e, .getResult(); } +// OG has unordered comparison as a form of optimization in addition to +// ordered comparison, while CIR doesn't. +// +// This means that we can't encode the comparison code of UGT (unordered +// greater than), at least not at the CIR level. +// +// The boolean shouldInvert compensates for this. +// For example: to get to the comparison code UGT, we pass in +// emitVectorFCmp (OLE, shouldInvert = true) since OLE is the inverse of UGT. 
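Illustrative aside, not part of the patch: the shouldInvert trick relies on ordered compares being false whenever an operand is NaN, so negating an ordered predicate yields its unordered complement. A minimal scalar sketch of that identity (names invented for illustration):

```c++
// Ordered <= is false when an operand is NaN, so !OLE is the unordered > (UGT).
constexpr bool ole(float a, float b) { return a <= b; }   // ordered <=
constexpr bool ugt(float a, float b) { return !ole(a, b); } // !OLE == unordered >

constexpr float qnan = __builtin_nanf("");
static_assert(ugt(3.0f, 2.0f) && !ugt(2.0f, 3.0f)); // normal values: plain >
static_assert(!ole(1.0f, qnan) && ugt(1.0f, qnan)); // NaN operand: OLE false, UGT true
// Per lane, this is roughly what createNot(createVecCompare(le, ...)) computes
// when shouldInvert is true.
```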
+ +// There are several ways to support this otherwise: +// - register extra CmpOpKind for unordered comparison types and build the +// translation code for +// to go from CIR -> LLVM dialect. Notice we get this naturally with +// shouldInvert, benefiting from existing infrastructure, albeit having to +// generate an extra `not` at CIR). +// - Just add extra comparison code to a new VecCmpOpKind instead of +// cluttering CmpOpKind. +// - Add a boolean in VecCmpOp to indicate if it's doing unordered or ordered +// comparison +// - Just emit the intrinsics call instead of calling this helper, see how the +// LLVM lowering handles this. +static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder, + llvm::SmallVector &ops, + mlir::Location loc, cir::CmpOpKind pred, + bool shouldInvert) { + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + // TODO(cir): Add isSignaling boolean once emitConstrainedFPCall implemented + assert(!cir::MissingFeatures::emitConstrainedFPCall()); + mlir::Value cmp = builder.createVecCompare(loc, pred, ops[0], ops[1]); + mlir::Value bitCast = builder.createBitcast( + shouldInvert ? builder.createNot(cmp) : cmp, ops[0].getType()); + return bitCast; +} + mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, - const CallExpr *e) { + const CallExpr *expr) { if (builtinID == Builtin::BI__builtin_cpu_is) { - cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_is"); + cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_is"); return {}; } if (builtinID == Builtin::BI__builtin_cpu_supports) { - cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_supports"); + cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_supports"); return {}; } if (builtinID == Builtin::BI__builtin_cpu_init) { - cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_init"); + cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_init"); return {}; } @@ -65,7 +100,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, getContext().GetBuiltinType(builtinID, error, &iceArguments); assert(error == ASTContext::GE_None && "Error while getting builtin type."); - for (auto [idx, arg] : llvm::enumerate(e->arguments())) + for (auto [idx, arg] : llvm::enumerate(expr->arguments())) ops.push_back(emitScalarOrConstFoldImmArg(iceArguments, idx, arg)); CIRGenBuilderTy &builder = getBuilder(); @@ -75,15 +110,15 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, default: return {}; case X86::BI_mm_clflush: - return emitIntrinsicCallOp(*this, e, "x86.sse2.clflush", voidTy, ops[0]); + return emitIntrinsicCallOp(*this, expr, "x86.sse2.clflush", voidTy, ops[0]); case X86::BI_mm_lfence: - return emitIntrinsicCallOp(*this, e, "x86.sse2.lfence", voidTy); + return emitIntrinsicCallOp(*this, expr, "x86.sse2.lfence", voidTy); case X86::BI_mm_pause: - return emitIntrinsicCallOp(*this, e, "x86.sse2.pause", voidTy); + return emitIntrinsicCallOp(*this, expr, "x86.sse2.pause", voidTy); case X86::BI_mm_mfence: - return emitIntrinsicCallOp(*this, e, "x86.sse2.mfence", voidTy); + return emitIntrinsicCallOp(*this, expr, "x86.sse2.mfence", voidTy); case X86::BI_mm_sfence: - return emitIntrinsicCallOp(*this, e, "x86.sse.sfence", voidTy); + return emitIntrinsicCallOp(*this, expr, "x86.sse.sfence", voidTy); case X86::BI_mm_prefetch: case X86::BI__rdtsc: case X86::BI__builtin_ia32_rdtscp: @@ -96,7 +131,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_undef128: case X86::BI__builtin_ia32_undef256: case X86::BI__builtin_ia32_undef512: - 
cgm.errorNYI(e->getSourceRange(), + cgm.errorNYI(expr->getSourceRange(), std::string("unimplemented X86 builtin call: ") + getContext().BuiltinInfo.getName(builtinID)); return {}; @@ -118,12 +153,12 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, index &= numElts - 1; cir::ConstantOp indexVal = - builder.getUInt64(index, getLoc(e->getExprLoc())); + builder.getUInt64(index, getLoc(expr->getExprLoc())); // These builtins exist so we can ensure the index is an ICE and in range. // Otherwise we could just do this in the header file. - return cir::VecExtractOp::create(builder, getLoc(e->getExprLoc()), ops[0], - indexVal); + return cir::VecExtractOp::create(builder, getLoc(expr->getExprLoc()), + ops[0], indexVal); } case X86::BI__builtin_ia32_vec_set_v4hi: case X86::BI__builtin_ia32_vec_set_v16qi: @@ -758,10 +793,18 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_cmpunordpd: case X86::BI__builtin_ia32_cmpneqps: case X86::BI__builtin_ia32_cmpneqpd: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_cmpnltps: case X86::BI__builtin_ia32_cmpnltpd: + return emitVectorFCmp(builder, ops, getLoc(expr->getExprLoc()), + cir::CmpOpKind::lt, /*shouldInvert=*/true); case X86::BI__builtin_ia32_cmpnleps: case X86::BI__builtin_ia32_cmpnlepd: + return emitVectorFCmp(builder, ops, getLoc(expr->getExprLoc()), + cir::CmpOpKind::le, /*shouldInvert=*/true); case X86::BI__builtin_ia32_cmpordps: case X86::BI__builtin_ia32_cmpordpd: case X86::BI__builtin_ia32_cmpph128_mask: @@ -846,7 +889,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3: case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: case X86::BI__builtin_ia32_prefetchi: - cgm.errorNYI(e->getSourceRange(), + cgm.errorNYI(expr->getSourceRange(), std::string("unimplemented X86 builtin call: ") + getContext().BuiltinInfo.getName(builtinID)); return {}; diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 3b5411179349b..4f5eaed1f5418 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -1115,24 +1115,9 @@ ROCMToolChain::getCommonDeviceLibNames( bool AMDGPUToolChain::shouldSkipSanitizeOption( const ToolChain &TC, const llvm::opt::ArgList &DriverArgs, StringRef TargetID, const llvm::opt::Arg *A) const { - // For actions without targetID, do nothing. 
- if (TargetID.empty()) - return false; - Option O = A->getOption(); - - if (!O.matches(options::OPT_fsanitize_EQ)) - return false; - - if (!DriverArgs.hasFlag(options::OPT_fgpu_sanitize, - options::OPT_fno_gpu_sanitize, true)) - return true; - auto &Diags = TC.getDriver().getDiags(); - - // For simplicity, we only allow -fsanitize=address - SanitizerMask K = parseSanitizerValue(A->getValue(), /*AllowGroups=*/false); - if (K != SanitizerKind::Address) - return true; + bool IsExplicitDevice = + A->getBaseArg().getOption().matches(options::OPT_Xarch_device); // Check 'xnack+' availability by default llvm::StringRef Processor = @@ -1153,10 +1138,17 @@ bool AMDGPUToolChain::shouldSkipSanitizeOption( (void)OptionalGpuArch; auto Loc = FeatureMap.find("xnack"); if (Loc == FeatureMap.end() || !Loc->second) { - Diags.Report( - clang::diag::warn_drv_unsupported_option_for_offload_arch_req_feature) - << A->getAsString(DriverArgs) << TargetID << "xnack+"; + if (IsExplicitDevice) { + Diags.Report( + clang::diag::err_drv_unsupported_option_for_offload_arch_req_feature) + << A->getAsString(DriverArgs) << TargetID << "xnack+"; + } else { + Diags.Report( + clang::diag::warn_drv_unsupported_option_for_offload_arch_req_feature) + << A->getAsString(DriverArgs) << TargetID << "xnack+"; + } return true; } + return false; } diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index c8601d1dedbcb..28e01dc64860a 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -144,7 +144,7 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUToolChain : public Generic_ELF { /// Needed for translating LTO options. const char *getDefaultLinker() const override { return "ld.lld"; } - /// Should skip sanitize options. + /// Should skip sanitize option. bool shouldSkipSanitizeOption(const ToolChain &TC, const llvm::opt::ArgList &DriverArgs, StringRef TargetID, @@ -203,18 +203,79 @@ class LLVM_LIBRARY_VISIBILITY ROCMToolChain : public AMDGPUToolChain { return SanitizerKind::Address; } - void diagnoseUnsupportedSanitizers(const llvm::opt::ArgList &Args) const { - if (!Args.hasFlag(options::OPT_fgpu_sanitize, options::OPT_fno_gpu_sanitize, - true)) - return; + bool diagnoseUnsupportedOption(const llvm::opt::Arg *A, + const llvm::opt::DerivedArgList &DAL, + const llvm::opt::ArgList &DriverArgs, + const char *Value = nullptr) const { auto &Diags = getDriver().getDiags(); - for (auto *A : Args.filtered(options::OPT_fsanitize_EQ)) { - SanitizerMask K = - parseSanitizerValue(A->getValue(), /*Allow Groups*/ false); - if (K != SanitizerKind::Address) - Diags.Report(clang::diag::warn_drv_unsupported_option_for_target) - << A->getAsString(Args) << getTriple().str(); + bool IsExplicitDevice = + A->getBaseArg().getOption().matches(options::OPT_Xarch_device); + + if (Value) { + unsigned DiagID = + IsExplicitDevice + ? clang::diag::err_drv_unsupported_option_part_for_target + : clang::diag::warn_drv_unsupported_option_part_for_target; + Diags.Report(DiagID) << Value << A->getAsString(DriverArgs) + << getTriple().str(); + } else { + unsigned DiagID = + IsExplicitDevice + ? 
clang::diag::err_drv_unsupported_option_for_target + : clang::diag::warn_drv_unsupported_option_for_target; + Diags.Report(DiagID) << A->getAsString(DAL) << getTriple().str(); } + return true; + } + + bool handleSanitizeOption(const ToolChain &TC, llvm::opt::DerivedArgList &DAL, + const llvm::opt::ArgList &DriverArgs, + StringRef TargetID, const llvm::opt::Arg *A) const { + if (TargetID.empty()) + return false; + // If we shouldn't do sanitizing, skip it. + if (!DriverArgs.hasFlag(options::OPT_fgpu_sanitize, + options::OPT_fno_gpu_sanitize, true)) + return true; + const llvm::opt::Option &Opt = A->getOption(); + // Sanitizer coverage is currently not supported for AMDGPU, so warn/error + // on every related option. + if (Opt.matches(options::OPT_fsan_cov_Group)) { + diagnoseUnsupportedOption(A, DAL, DriverArgs); + } + // If this isn't a sanitizer option, don't handle it. + if (!Opt.matches(options::OPT_fsanitize_EQ)) + return false; + + SmallVector SupportedSanitizers; + SmallVector UnSupportedSanitizers; + + for (const char *Value : A->getValues()) { + SanitizerMask K = parseSanitizerValue(Value, /*Allow Groups*/ false); + if (K & ROCMToolChain::getSupportedSanitizers()) + SupportedSanitizers.push_back(Value); + else + UnSupportedSanitizers.push_back(Value); + } + + // If there are no supported sanitizers, drop the whole argument. + if (SupportedSanitizers.empty()) { + diagnoseUnsupportedOption(A, DAL, DriverArgs); + return true; + } + // If only some sanitizers are unsupported, report each one individually. + if (!UnSupportedSanitizers.empty()) { + for (const char *Value : UnSupportedSanitizers) { + diagnoseUnsupportedOption(A, DAL, DriverArgs, Value); + } + } + // If we know the target arch, check if the sanitizer is supported for it. + if (shouldSkipSanitizeOption(TC, DriverArgs, TargetID, A)) + return true; + + // Add a new argument with only the supported sanitizers. + DAL.AddJoinedArg(A, A->getOption(), llvm::join(SupportedSanitizers, ",")); + return true; } }; diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index a96d40166fdb7..5ad3d06854e42 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -357,8 +357,6 @@ AMDGPUOpenMPToolChain::AMDGPUOpenMPToolChain(const Driver &D, const llvm::Triple // Lookup binaries into the driver directory, this is used to // discover the 'amdgpu-arch' executable. getProgramPaths().push_back(getDriver().Dir); - // Diagnose unsupported sanitizer options only once. - diagnoseUnsupportedSanitizers(Args); } void AMDGPUOpenMPToolChain::addClangTargetOptions( @@ -456,16 +454,11 @@ llvm::opt::DerivedArgList *AMDGPUOpenMPToolChain::TranslateArgs( const OptTable &Opts = getDriver().getOpts(); - // Skip sanitize options passed from the HostTC. Claim them early. - // The decision to sanitize device code is computed only by - // 'shouldSkipSanitizeOption'. - if (DAL->hasArg(options::OPT_fsanitize_EQ)) - DAL->claimAllArgs(options::OPT_fsanitize_EQ); - - for (Arg *A : Args) - if (!shouldSkipSanitizeOption(*this, Args, BoundArch, A) && - !llvm::is_contained(*DAL, A)) + for (Arg *A : Args) { + // Filter unsupported sanitizers passed from the HostTC. 
+ if (!handleSanitizeOption(*this, *DAL, Args, BoundArch, A)) DAL->append(A); + } if (!BoundArch.empty()) { DAL->eraseArg(options::OPT_march_EQ); @@ -557,9 +550,8 @@ void AMDGPUOpenMPToolChain::AddIAMCUIncludeArgs(const ArgList &Args, SanitizerMask AMDGPUOpenMPToolChain::getSupportedSanitizers() const { // The AMDGPUOpenMPToolChain only supports sanitizers in the sense that it // allows sanitizer arguments on the command line if they are supported by the - // host toolchain. The AMDGPUOpenMPToolChain will actually ignore any command - // line arguments for any of these "supported" sanitizers. That means that no - // sanitization of device code is actually supported at this time. + // host toolchain. The AMDGPUOpenMPToolChain will later filter unsupported + // sanitizers from the command line arguments. // // This behavior is necessary because the host and device toolchains // invocations often share the command line, so the device toolchain must diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index b1c30beae1d35..38d56a24ef006 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -228,8 +228,6 @@ HIPAMDToolChain::HIPAMDToolChain(const Driver &D, const llvm::Triple &Triple, // Lookup binaries into the driver directory, this is used to // discover the clang-offload-bundler executable. getProgramPaths().push_back(getDriver().Dir); - // Diagnose unsupported sanitizer options only once. - diagnoseUnsupportedSanitizers(Args); } void HIPAMDToolChain::addActionsFromClangTargetOptions( @@ -306,7 +304,8 @@ HIPAMDToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, const OptTable &Opts = getDriver().getOpts(); for (Arg *A : Args) { - if (!shouldSkipSanitizeOption(*this, Args, BoundArch, A)) + // Filter unsupported sanitizers passed from the HostTC. + if (!handleSanitizeOption(*this, *DAL, Args, BoundArch, A)) DAL->append(A); } @@ -362,9 +361,8 @@ void HIPAMDToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs, SanitizerMask HIPAMDToolChain::getSupportedSanitizers() const { // The HIPAMDToolChain only supports sanitizers in the sense that it allows // sanitizer arguments on the command line if they are supported by the host - // toolchain. The HIPAMDToolChain will actually ignore any command line - // arguments for any of these "supported" sanitizers. That means that no - // sanitization of device code is actually supported at this time. + // toolchain. The HIPAMDToolChain will later filter unsupported sanitizers + // from the command line arguments. 
// // This behavior is necessary because the host and device toolchains // invocations often share the command line, so the device toolchain must diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h index 9f5b726d7b789..3df6930f94be3 100644 --- a/clang/lib/Headers/avx10_2bf16intrin.h +++ b/clang/lib/Headers/avx10_2bf16intrin.h @@ -221,12 +221,12 @@ static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_move_sbh(__m128bh __a, return __a; } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), __W); } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) { return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), _mm_setzero_pbh()); diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 53b18df764370..e4184795e47e9 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -1369,17 +1369,15 @@ _mm512_mul_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epi32(__X, __Y), (__v8di)__W); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epi32(__X, __Y), (__v8di)_mm512_setzero_si512 ()); @@ -1390,17 +1388,15 @@ _mm512_mul_epu32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epu32(__X, __Y), (__v8di)__W); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epu32(__X, __Y), (__v8di)_mm512_setzero_si512 ()); @@ -1820,14 +1816,14 @@ _mm512_maskz_abs_epi32(__mmask16 __U, __m512i __A) { (__v16si)_mm512_setzero_si512()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_add_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 
__DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_add_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -1850,14 +1846,14 @@ _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_add_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_add_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } @@ -1879,28 +1875,28 @@ _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_add_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_add_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_add_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_add_ps(__A, __B), @@ -1935,14 +1931,14 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ (__v16sf)_mm512_setzero_ps())) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_sub_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_sub_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -1964,14 +1960,14 @@ _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_sub_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d 
__DEFAULT_FN_ATTRS128 -_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_sub_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } @@ -1994,28 +1990,28 @@ _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_sub_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_sub_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_sub_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_sub_ps(__A, __B), @@ -2050,14 +2046,14 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ (__v16sf)_mm512_setzero_ps())) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_mul_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_mul_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -2079,14 +2075,14 @@ _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_mul_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_mul_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } @@ -2109,28 +2105,28 @@ _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_mul_pd(__A, 
__B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_mul_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_mul_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_mul_ps(__A, __B), @@ -2165,14 +2161,14 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ (__v16sf)_mm512_setzero_ps())) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_div_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_div_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -2195,14 +2191,14 @@ _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_div_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_div_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } @@ -2230,14 +2226,14 @@ static __inline __m512d return (__m512d)((__v8df)__a/(__v8df)__b); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_div_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_div_pd(__A, __B), @@ -2249,14 +2245,14 @@ _mm512_div_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a/(__v16sf)__b); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_div_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static 
__inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_div_ps(__A, __B), diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index a6936da056abc..26ac67bb2b7a7 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -594,23 +594,20 @@ _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) { (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_add_sh(__m128h __A, __m128h __B) { __A[0] += __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_add_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_add_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); } @@ -630,23 +627,20 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h + __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sub_sh(__m128h __A, __m128h __B) { __A[0] -= __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_sub_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_sub_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); } @@ -666,23 +660,20 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h + __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mul_sh(__m128h __A, __m128h __B) { __A[0] *= __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_mul_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_mul_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, 
__A, _mm_setzero_ph()); } @@ -702,23 +693,20 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h + __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_div_sh(__m128h __A, __m128h __B) { __A[0] /= __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_div_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_div_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); } @@ -966,22 +954,19 @@ static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P, } // moves with vmovsh: -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a, - __m128h __b) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_move_sh(__m128h __a, __m128h __b) { __a[0] = __b[0]; return __a; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) { return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), _mm_setzero_ph()); } diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 1e6e42df6b5fb..5a1b540e07e3a 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -347,65 +347,57 @@ _mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) { (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epi32(__X, __Y), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epi32(__X, __Y), (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epi32(__X, __Y), (__v2di)__W); } -static __inline__ 
__m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epi32(__X, __Y), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epu32(__X, __Y), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epu32(__X, __Y), (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epu32(__X, __Y), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epu32(__X, __Y), (__v2di)_mm_setzero_si128()); @@ -1426,56 +1418,56 @@ _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) (__v8sf) __C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_add_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_add_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_add_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_add_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_add_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { return 
(__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_add_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_add_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_add_ps(__A, __B), @@ -2202,56 +2194,56 @@ _mm256_maskz_cvtepu32_ps(__mmask8 __U, __m256i __A) { (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_div_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_div_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_div_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_div_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_div_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_div_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_div_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_div_ps(__A, __B), @@ -2717,56 +2709,56 @@ _mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_mul_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_mul_pd(__mmask8 __U, 
__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_mul_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_mul_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_mul_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_mul_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_mul_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_mul_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_mul_ps(__A, __B), @@ -3500,56 +3492,56 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_sub_pd(__A, __B), (__v2df)__W); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_sub_pd(__A, __B), (__v2df)_mm_setzero_pd()); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_sub_pd(__A, __B), (__v4df)__W); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_sub_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_sub_ps(__A, __B), (__v4sf)__W); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { 
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_sub_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_sub_ps(__A, __B), (__v8sf)__W); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_sub_ps(__A, __B), diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp index 45620fcd358c8..74a489f4b3ac9 100644 --- a/clang/lib/Interpreter/IncrementalExecutor.cpp +++ b/clang/lib/Interpreter/IncrementalExecutor.cpp @@ -18,7 +18,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" -#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupport.h" #include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h" #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h" diff --git a/clang/test/AST/ByteCode/literals.cpp b/clang/test/AST/ByteCode/literals.cpp index 5028ebfa3de30..c6d79f9c60058 100644 --- a/clang/test/AST/ByteCode/literals.cpp +++ b/clang/test/AST/ByteCode/literals.cpp @@ -1270,6 +1270,17 @@ namespace StmtExprs { namespace CrossFuncLabelDiff { constexpr long a(bool x) { return x ? 0 : (intptr_t)&&lbl + (0 && ({lbl: 0;})); } } + + /// GCC agrees with the bytecode interpreter here. + void switchInSE() { + static_assert(({ // ref-error {{not an integral constant expression}} + int i = 20; + switch(10) { + case 10: i = 300; // ref-note {{a constant expression cannot modify an object that is visible outside that expression}} + } + i; + }) == 300); + } } #endif diff --git a/clang/test/CIR/CodeGen/builtin-fcmp-sse.c b/clang/test/CIR/CodeGen/builtin-fcmp-sse.c new file mode 100644 index 0000000000000..c273d6b3fca0e --- /dev/null +++ b/clang/test/CIR/CodeGen/builtin-fcmp-sse.c @@ -0,0 +1,213 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); +typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); + +__m128 test_cmpnleps(__m128 A, __m128 B) { + // CIR-LABEL: cir.func dso_local @test_cmpnleps( + // CIR: %[[ARG0:.*]]: !cir.vector<4 x !cir.float> {{.*}}, %[[ARG1:.*]]: !cir.vector<4 x !cir.float> {{.*}}) -> !cir.vector<4 x !cir.float> inline(never) { + // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["A", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["B", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["__retval"] {alignment = 16 : i64} + // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<4 x !cir.float>, 
!cir.ptr> + // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<4 x !cir.float>, !cir.ptr> + // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: %[[VEC_0:.*]] = cir.vec.cmp(le, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i> + // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i> + // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float> + // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<4 x !cir.float>, !cir.ptr> + // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: cir.return %[[LOAD_2]] : !cir.vector<4 x !cir.float> + // CIR: } + + // LLVM-LABEL: define dso_local <4 x float> @test_cmpnleps( + // LLVM-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { + // LLVM-NEXT: [[TMP3:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: [[TMP4:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: [[TMP5:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: store <4 x float> [[TMP0]], ptr [[TMP3]], align 16 + // LLVM-NEXT: store <4 x float> [[TMP1]], ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[TMP3]], align 16 + // LLVM-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP8:%.*]] = fcmp ole <4 x float> [[TMP6]], [[TMP7]] + // LLVM-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32> + // LLVM-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], splat (i32 -1) + // LLVM-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <4 x float> + // LLVM-NEXT: store <4 x float> [[TMP11]], ptr [[TMP5]], align 16 + // LLVM-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[TMP5]], align 16 + // LLVM-NEXT: ret <4 x float> [[TMP12]] + + // OGCG-LABEL: define dso_local <4 x float> @test_cmpnleps( + // OGCG-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { + // OGCG-NEXT: [[ENTRY:.*:]] + // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 + // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 + // OGCG-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16 + // OGCG-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16 + // OGCG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP2:%.*]] = fcmp ugt <4 x float> [[TMP0]], [[TMP1]] + // OGCG-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> + // OGCG-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> + // OGCG-NEXT: ret <4 x float> [[TMP4]] + return __builtin_ia32_cmpnleps(A, B); +} + +__m128d test_cmpnlepd(__m128d A, __m128d B) { + // CIR-LABEL: cir.func dso_local @test_cmpnlepd( + // CIR: %[[ARG0:.*]]: !cir.vector<2 x !cir.double> {{.*}}, %[[ARG1:.*]]: !cir.vector<2 x !cir.double> {{.*}}) -> !cir.vector<2 x !cir.double> inline(never) { + // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["A", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["B", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["__retval"] {alignment = 16 : i64} + // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<2 x 
!cir.double>, !cir.ptr> + // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<2 x !cir.double>, !cir.ptr> + // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: %[[VEC_0:.*]] = cir.vec.cmp(le, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i> + // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i> + // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double> + // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<2 x !cir.double>, !cir.ptr> + // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: cir.return %[[LOAD_2]] : !cir.vector<2 x !cir.double> + // CIR: } + + // LLVM-LABEL: define dso_local <2 x double> @test_cmpnlepd( + // LLVM-SAME: <2 x double> [[TMP0:%.*]], <2 x double> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { + // LLVM-NEXT: [[TMP3:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: [[TMP4:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: [[TMP5:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: store <2 x double> [[TMP0]], ptr [[TMP3]], align 16 + // LLVM-NEXT: store <2 x double> [[TMP1]], ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16 + // LLVM-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP8:%.*]] = fcmp ole <2 x double> [[TMP6]], [[TMP7]] + // LLVM-NEXT: [[TMP9:%.*]] = sext <2 x i1> [[TMP8]] to <2 x i64> + // LLVM-NEXT: [[TMP10:%.*]] = xor <2 x i64> [[TMP9]], splat (i64 -1) + // LLVM-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <2 x double> + // LLVM-NEXT: store <2 x double> [[TMP11]], ptr [[TMP5]], align 16 + // LLVM-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[TMP5]], align 16 + // LLVM-NEXT: ret <2 x double> [[TMP12]] + + // OGCG-LABEL: define dso_local <2 x double> @test_cmpnlepd( + // OGCG-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { + // OGCG-NEXT: [[ENTRY:.*:]] + // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 + // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 + // OGCG-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16 + // OGCG-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16 + // OGCG-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP2:%.*]] = fcmp ugt <2 x double> [[TMP0]], [[TMP1]] + // OGCG-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> + // OGCG-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> + // OGCG-NEXT: ret <2 x double> [[TMP4]] + return __builtin_ia32_cmpnlepd(A, B); +} + +__m128 test_cmpnltps(__m128 A, __m128 B) { + // CIR-LABEL: cir.func dso_local @test_cmpnltps( + // CIR-SAME: %[[ARG0:.*]]: !cir.vector<4 x !cir.float> {{.*}}, %[[ARG1:.*]]: !cir.vector<4 x !cir.float> {{.*}}) -> !cir.vector<4 x !cir.float> inline(never) { + // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["A", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["B", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["__retval"] {alignment = 16 : i64} + // CIR: cir.store 
%[[ARG0]], %[[ALLOCA_0]] : !cir.vector<4 x !cir.float>, !cir.ptr> + // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<4 x !cir.float>, !cir.ptr> + // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: %[[VEC_0:.*]] = cir.vec.cmp(lt, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i> + // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i> + // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float> + // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<4 x !cir.float>, !cir.ptr> + // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: cir.return %[[LOAD_2]] : !cir.vector<4 x !cir.float> + // CIR: } + + // LLVM-LABEL: define dso_local <4 x float> @test_cmpnltps( + // LLVM-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { + // LLVM-NEXT: [[TMP3:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: [[TMP4:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: [[TMP5:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: store <4 x float> [[TMP0]], ptr [[TMP3]], align 16 + // LLVM-NEXT: store <4 x float> [[TMP1]], ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[TMP3]], align 16 + // LLVM-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP8:%.*]] = fcmp olt <4 x float> [[TMP6]], [[TMP7]] + // LLVM-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32> + // LLVM-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], splat (i32 -1) + // LLVM-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <4 x float> + // LLVM-NEXT: store <4 x float> [[TMP11]], ptr [[TMP5]], align 16 + // LLVM-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[TMP5]], align 16 + // LLVM-NEXT: ret <4 x float> [[TMP12]] + + // OGCG-LABEL: define dso_local <4 x float> @test_cmpnltps( + // OGCG-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { + // OGCG-NEXT: [[ENTRY:.*:]] + // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 + // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 + // OGCG-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16 + // OGCG-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16 + // OGCG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP2:%.*]] = fcmp uge <4 x float> [[TMP0]], [[TMP1]] + // OGCG-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> + // OGCG-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> + // OGCG-NEXT: ret <4 x float> [[TMP4]] + return __builtin_ia32_cmpnltps(A, B); +} + +__m128d test_cmpnltpd(__m128d A, __m128d B) { + // CIR-LABEL: cir.func dso_local @test_cmpnltpd( + // CIR: %[[ARG0:.*]]: !cir.vector<2 x !cir.double> {{.*}}, %[[ARG1:.*]]: !cir.vector<2 x !cir.double> {{.*}}) -> !cir.vector<2 x !cir.double> inline(never) { + // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["A", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["B", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["__retval"] {alignment = 16 : i64} + // CIR: 
cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<2 x !cir.double>, !cir.ptr> + // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<2 x !cir.double>, !cir.ptr> + // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: %[[VEC_0:.*]] = cir.vec.cmp(lt, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i> + // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i> + // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double> + // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<2 x !cir.double>, !cir.ptr> + // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: cir.return %[[LOAD_2]] : !cir.vector<2 x !cir.double> + // CIR: } + + // LLVM-LABEL: define dso_local <2 x double> @test_cmpnltpd( + // LLVM-SAME: <2 x double> [[TMP0:%.*]], <2 x double> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { + // LLVM-NEXT: [[TMP3:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: [[TMP4:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: [[TMP5:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: store <2 x double> [[TMP0]], ptr [[TMP3]], align 16 + // LLVM-NEXT: store <2 x double> [[TMP1]], ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16 + // LLVM-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP6]], [[TMP7]] + // LLVM-NEXT: [[TMP9:%.*]] = sext <2 x i1> [[TMP8]] to <2 x i64> + // LLVM-NEXT: [[TMP10:%.*]] = xor <2 x i64> [[TMP9]], splat (i64 -1) + // LLVM-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <2 x double> + // LLVM-NEXT: store <2 x double> [[TMP11]], ptr [[TMP5]], align 16 + // LLVM-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[TMP5]], align 16 + // LLVM-NEXT: ret <2 x double> [[TMP12]] + + // OGCG-LABEL: define dso_local <2 x double> @test_cmpnltpd( + // OGCG-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { + // OGCG-NEXT: [[ENTRY:.*:]] + // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 + // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 + // OGCG-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16 + // OGCG-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16 + // OGCG-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP2:%.*]] = fcmp uge <2 x double> [[TMP0]], [[TMP1]] + // OGCG-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> + // OGCG-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> + // OGCG-NEXT: ret <2 x double> [[TMP4]] + return __builtin_ia32_cmpnltpd(A, B); +} diff --git a/clang/test/CodeGen/X86/avx10_2bf16-builtins.c b/clang/test/CodeGen/X86/avx10_2bf16-builtins.c index f8a4c51d9ceb3..fac7ef2e2bf29 100644 --- a/clang/test/CodeGen/X86/avx10_2bf16-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2bf16-builtins.c @@ -1,7 +1,11 @@ // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386 -target-feature +avx10.2 -emit-llvm 
-o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386 -target-feature +avx10.2 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s + +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386 -target-feature +avx10.2 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s #include +#include "builtin_test_helpers.h" __m256bh test_mm256_setzero_pbh() { // CHECK-LABEL: @test_mm256_setzero_pbh @@ -353,6 +357,7 @@ __m128bh test_mm_move_sbh(__m128bh A, __m128bh B) { // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 0 return _mm_move_sbh(A, B); } +TEST_CONSTEXPR(match_m128bh(_mm_move_sbh((__m128bh)(__v8bf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f}, (__m128bh)(__v8bf){9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f,16.0f}), 9.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); __m128bh test_mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { // CHECK-LABEL: @test_mm_mask_move_sbh @@ -366,6 +371,7 @@ __m128bh test_mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128b // CHECK-NEXT: insertelement <8 x bfloat> [[VEC]], bfloat [[SEL]], i64 0 return _mm_mask_move_sbh(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128bh(_mm_mask_move_sbh((__m128bh)(__v8bf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f}, (__mmask8)0x01, (__m128bh)(__v8bf){100.0f,200.0f,300.0f,400.0f,500.0f,600.0f,700.0f,800.0f}, (__m128bh)(__v8bf){9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f,16.0f}), 9.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); __m128bh test_mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) { // CHECK-LABEL: @test_mm_maskz_move_sbh diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index ec813e5acd7cf..eb25aa538e9a3 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -3137,6 +3137,7 @@ __m512i test_mm512_maskz_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B) { //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_mul_epi32(__k,__A,__B); } +TEST_CONSTEXPR(match_m512i(_mm512_maskz_mul_epi32((__mmask8)0b11110000, (__m512i){1, 2, 3, 4, 5, 6, 7, 8}, (__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 0, 0, 0, 0, 250, 360, 490, 640)); __m512i test_mm512_mask_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) { //CHECK-LABEL: test_mm512_mask_mul_epi32 @@ -3148,6 +3149,7 @@ __m512i test_mm512_mask_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B, __m512 //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_mul_epi32(__src,__k,__A,__B); } +TEST_CONSTEXPR(match_m512i(_mm512_mask_mul_epi32((__m512i){1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000}, (__mmask8)0b11110000, (__m512i){1, 2, 3, 4, 5, 6, 7, 8}, (__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 1000, 2000, 3000, 4000, 250, 360, 490, 640)); __m512i test_mm512_mul_epu32 (__m512i __A, __m512i __B) { //CHECK-LABEL: test_mm512_mul_epu32 @@ -3166,6 +3168,7 @@ __m512i test_mm512_maskz_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B) { //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_mul_epu32(__k,__A,__B); } 
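// Illustrative sketch of the per-lane write-mask rule the surrounding TEST_CONSTEXPR
// expectations are computed from (assumed model; ref_masked_lane is a hypothetical
// helper, not an intrinsic from avx512fintrin.h): lane i of a masked packed op takes
// the arithmetic result when bit i of the mask is set, and otherwise the pass-through
// lane (mask_* forms) or zero (maskz_* forms).
static inline long long ref_masked_lane(unsigned char mask, int lane,
                                        long long op_result,
                                        long long fallback) {
  return ((mask >> lane) & 1) ? op_result : fallback;
}
// For example, _mm512_maskz_mul_epu32 with mask 0b11110000 zeroes lanes 0-3 and keeps
// the products 5*50=250, 6*60=360, 7*70=490, 8*80=640 in lanes 4-7, which is exactly
// what the check below expects.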
+TEST_CONSTEXPR(match_m512i(_mm512_maskz_mul_epu32((__mmask8)0b11110000, (__m512i){1, 2, 3, 4, 5, 6, 7, 8}, (__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 0, 0, 0, 0, 250, 360, 490, 640)); __m512i test_mm512_mask_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) { //CHECK-LABEL: test_mm512_mask_mul_epu32 @@ -3175,6 +3178,7 @@ __m512i test_mm512_mask_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B, __m512 //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_mul_epu32(__src,__k,__A,__B); } +TEST_CONSTEXPR(match_m512i(_mm512_mask_mul_epu32((__m512i){1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000}, (__mmask8)0b11110000, (__m512i){1, 2, 3, 4, 5, 6, 7, 8}, (__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 1000, 2000, 3000, 4000, 250, 360, 490, 640)); __m512i test_mm512_maskz_mullo_epi32 (__mmask16 __k,__m512i __A, __m512i __B) { //CHECK-LABEL: test_mm512_maskz_mullo_epi32 @@ -3237,12 +3241,16 @@ __m512d test_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d _ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_add_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_add_pd((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__mmask8)0b11110000, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}, (__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}), 1.0, 2.0, 3.0, 4.0, 550.0, 660.0, 770.0, 880.0)); + __m512d test_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_maskz_add_pd // CHECK: fadd <8 x double> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_add_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_add_pd((__mmask8)0b11110000, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}, (__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}), 0.0, 0.0, 0.0, 0.0, 550.0, 660.0, 770.0, 880.0)); + __m512 test_mm512_add_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_add_round_ps // CHECK: @llvm.x86.avx512.add.ps.512 @@ -3266,12 +3274,16 @@ __m512 test_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_add_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_mask_add_ps((__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__mmask16)0b1111111100000000, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}, (__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 1600.0f}), 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 990.0f, 1100.0f, 1210.0f, 1320.0f, 1430.0f, 1540.0f, 1650.0f, 1760.0f)); + __m512 test_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_maskz_add_ps // CHECK: fadd <16 x float> %{{.*}}, %{{.*}} // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_add_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_add_ps((__mmask16)0b1111111100000000, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}, (__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 
1600.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 990.0f, 1100.0f, 1210.0f, 1320.0f, 1430.0f, 1540.0f, 1650.0f, 1760.0f)); + __m128 test_mm_add_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_add_round_ss // CHECK: @llvm.x86.avx512.mask.add.ss.round @@ -3302,6 +3314,8 @@ __m128 test_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_mask_add_ss(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_add_ss((__m128)(__v4sf){10.0f, 100.0f, 200.0f, 300.0f}, 0x1,(__m128)(__v4sf){1.25f, 3.0f, 4.0f, 5.0f},(__m128)(__v4sf){2.75f, 6.0f, 7.0f, 8.0f}),4.0f, 100.0f, 200.0f, 300.0f)); + __m128 test_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_add_ss // CHECK-NOT: @llvm.x86.avx512.mask.add.ss.round @@ -3317,6 +3331,8 @@ __m128 test_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_maskz_add_ss(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_add_ss(0x1, (__m128)(__v4sf){1.25f, 3.0f, 4.0f, 5.0f}, (__m128)(__v4sf){2.75f, 6.0f, 7.0f, 8.0f}), 4.0f, 0.0f, 0.0f, 0.0f)); + __m128d test_mm_add_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_add_round_sd // CHECK: @llvm.x86.avx512.mask.add.sd.round @@ -3347,6 +3363,8 @@ __m128d test_mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_mask_add_sd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_add_sd((__m128d)(__v2df){10.0, 999.0}, 0x1, (__m128d)(__v2df){5.5, 77.0}, (__m128d)(__v2df){0.25, 88.0}), 5.75, 999.0)); + __m128d test_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_add_sd // CHECK-NOT: @llvm.x86.avx512.mask.add.sd.round @@ -3362,6 +3380,8 @@ __m128d test_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_maskz_add_sd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_add_sd(0x1, (__m128d)(__v2df){5.5, 77.0}, (__m128d)(__v2df){0.25, 88.0}), 5.75, 0.0)); + __m512d test_mm512_sub_round_pd(__m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_sub_round_pd // CHECK: @llvm.x86.avx512.sub.pd.512 @@ -3385,12 +3405,16 @@ __m512d test_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d _ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_sub_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_sub_pd((__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}, (__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 100.0, 200.0, 300.0, 400.0, -45.0, -54.0, -63.0, -72.0)); + __m512d test_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_maskz_sub_pd // CHECK: fsub <8 x double> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_sub_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_sub_pd((__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0.0, 0.0, 0.0, 0.0, -45.0, -54.0, -63.0, -72.0)); + __m512 test_mm512_sub_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_sub_round_ps // CHECK: @llvm.x86.avx512.sub.ps.512 @@ 
-3414,12 +3438,16 @@ __m512 test_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_sub_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_mask_sub_ps((__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 1600.0f}, (__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, -81.0f, -90.0f, -99.0f, -108.0f, -117.0f, -126.0f, -135.0f, -144.0f)); + __m512 test_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_maskz_sub_ps // CHECK: fsub <16 x float> %{{.*}}, %{{.*}} // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_sub_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_sub_ps((__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -81.0f, -90.0f, -99.0f, -108.0f, -117.0f, -126.0f, -135.0f, -144.0f)); + __m128 test_mm_sub_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_sub_round_ss // CHECK: @llvm.x86.avx512.mask.sub.ss.round @@ -3450,6 +3478,8 @@ __m128 test_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_mask_sub_ss(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_sub_ss((__m128)(__v4sf){-1.0f, 10.0f, 20.0f, 30.0f}, 0x1, (__m128)(__v4sf){7.0f, 3.0f, 4.0f, 5.0f}, (__m128)(__v4sf){2.5f, 6.0f, 7.0f, 8.0f}), 4.5f, 10.0f, 20.0f, 30.0f)); + __m128 test_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_sub_ss // CHECK-NOT: @llvm.x86.avx512.mask.sub.ss.round @@ -3465,6 +3495,8 @@ __m128 test_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_maskz_sub_ss(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_sub_ss(0x1, (__m128)(__v4sf){7.0f, 3.0f, 4.0f, 5.0f}, (__m128)(__v4sf){2.5f, 6.0f, 7.0f, 8.0f}), 4.5f, 0.0f, 0.0f, 0.0f)); + __m128d test_mm_sub_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_sub_round_sd // CHECK: @llvm.x86.avx512.mask.sub.sd.round @@ -3495,6 +3527,8 @@ __m128d test_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_mask_sub_sd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_sub_sd((__m128d)(__v2df){-1.0, 111.0}, 0x1, (__m128d)(__v2df){9.0, 70.0}, (__m128d)(__v2df){3.5, 80.0}), 5.5, 111.0)); + __m128d test_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_sub_sd // CHECK-NOT: @llvm.x86.avx512.mask.sub.sd.round @@ -3510,6 +3544,8 @@ __m128d test_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_maskz_sub_sd(__U,__A,__B); } 
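// A minimal reference sketch for the scalar forms checked here (assumed model inferred
// from these expectations; ref_scalar_lane0 is a hypothetical helper, not something
// defined in the headers): only lane 0 is conditional, taking the arithmetic result
// when bit 0 of the mask is set and the fallback value otherwise (the __W lane for
// _mm_mask_*_sd/ss, 0.0 for _mm_maskz_*_sd/ss).
static inline double ref_scalar_lane0(unsigned char mask, double op_result,
                                      double fallback) {
  return (mask & 1) ? op_result : fallback;
}
// For example, _mm_maskz_sub_sd(0x1, {9.0, 70.0}, {3.5, 80.0}) produces 9.0 - 3.5 = 5.5
// in lane 0 because mask bit 0 is set, and zeroes the upper lane, matching the
// expectation below.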
+TEST_CONSTEXPR(match_m128d(_mm_maskz_sub_sd(0x1, (__m128d)(__v2df){9.0, 70.0}, (__m128d)(__v2df){3.5, 80.0}), 5.5, 0.0)); + __m512d test_mm512_mul_round_pd(__m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_mul_round_pd // CHECK: @llvm.x86.avx512.mul.pd.512 @@ -3533,12 +3569,16 @@ __m512d test_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d _ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_mul_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_mul_pd((__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}, (__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 100.0, 200.0, 300.0, 400.0, 250.0, 360.0, 490.0, 640.0)); + __m512d test_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_maskz_mul_pd // CHECK: fmul <8 x double> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_mul_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_mul_pd((__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0.0, 0.0, 0.0, 0.0, 250.0, 360.0, 490.0, 640.0)); + __m512 test_mm512_mul_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_mul_round_ps // CHECK: @llvm.x86.avx512.mul.ps.512 @@ -3562,12 +3602,16 @@ __m512 test_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_mul_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_mask_mul_ps((__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 1600.0f}, (__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 810.0f, 1000.0f, 1210.0f, 1440.0f, 1690.0f, 1960.0f, 2250.0f, 2560.0f)); + __m512 test_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_maskz_mul_ps // CHECK: fmul <16 x float> %{{.*}}, %{{.*}} // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_mul_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_mul_ps((__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 810.0f, 1000.0f, 1210.0f, 1440.0f, 1690.0f, 1960.0f, 2250.0f, 2560.0f)); + __m128 test_mm_mul_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_mul_round_ss // CHECK: @llvm.x86.avx512.mask.mul.ss.round @@ -3598,6 +3642,8 @@ __m128 test_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_mask_mul_ss(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_mul_ss((__m128)(__v4sf){42.0f, -1.0f, -2.0f, -3.0f}, 0x1, (__m128)(__v4sf){6.0f, 9.0f, 9.0f, 9.0f}, (__m128)(__v4sf){7.0f, 8.0f, 8.0f, 8.0f}), 42.0f, -1.0f, -2.0f, 
-3.0f)); + __m128 test_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_mul_ss // CHECK-NOT: @llvm.x86.avx512.mask.mul.ss.round @@ -3613,6 +3659,8 @@ __m128 test_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_maskz_mul_ss(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_mul_ss(0x1, (__m128)(__v4sf){6.0f, 9.0f, 9.0f, 9.0f}, (__m128)(__v4sf){7.0f, 8.0f, 8.0f, 8.0f}), 42.0f, 0.0f, 0.0f, 0.0f)); + __m128d test_mm_mul_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_mul_round_sd // CHECK: @llvm.x86.avx512.mask.mul.sd.round @@ -3643,6 +3691,8 @@ __m128d test_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_mask_mul_sd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_mul_sd((__m128d)(__v2df){123.0, -9.0}, 0x1, (__m128d)(__v2df){2.5, 1.0}, (__m128d)(__v2df){4.0, 2.0}), 10.0, -9.0)); + __m128d test_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_mul_sd // CHECK-NOT: @llvm.x86.avx512.mask.mul.sd.round @@ -3658,6 +3708,8 @@ __m128d test_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_maskz_mul_sd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_mul_sd(0x1, (__m128d)(__v2df){2.5, 1.0}, (__m128d)(__v2df){4.0, 2.0}), 10.0, 0.0)); + __m512d test_mm512_div_round_pd(__m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_div_round_pd // CHECK: @llvm.x86.avx512.div.pd.512 @@ -3687,12 +3739,16 @@ __m512d test_mm512_mask_div_pd(__m512d __w, __mmask8 __u, __m512d __a, __m512d _ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_div_pd(__w,__u,__a,__b); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_div_pd((__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}, (__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 100.0, 200.0, 300.0, 400.0, 0.1, 0.1, 0.1, 0.1)); + __m512d test_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_maskz_div_pd // CHECK: fdiv <8 x double> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_div_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_div_pd((__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.1)); + __m512 test_mm512_div_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_div_round_ps // CHECK: @llvm.x86.avx512.div.ps.512 @@ -3722,12 +3778,16 @@ __m512 test_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_div_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_mask_div_ps((__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 1600.0f}, (__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 100.0f, 200.0f, 300.0f, 
400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f)); + __m512 test_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_maskz_div_ps // CHECK: fdiv <16 x float> %{{.*}}, %{{.*}} // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_div_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_div_ps((__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f)); + __m128 test_mm_div_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_div_round_ss // CHECK: @llvm.x86.avx512.mask.div.ss.round @@ -3757,6 +3817,8 @@ __m128 test_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_mask_div_ss(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_div_ss((__m128)(__v4sf){-7.0f, 5.0f, 6.0f, 7.0f}, 0x1, (__m128)(__v4sf){9.0f, 1.0f, 1.0f, 1.0f}, (__m128)(__v4sf){3.0f, 2.0f, 2.0f, 2.0f}), 3.0f, 5.0f, 6.0f, 7.0f)); + __m128 test_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_div_ss // CHECK: extractelement <4 x float> %{{.*}}, i32 0 @@ -3771,6 +3833,8 @@ __m128 test_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_maskz_div_ss(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_div_ss(0x1, (__m128)(__v4sf){9.0f, 1.0f, 1.0f, 1.0f}, (__m128)(__v4sf){3.0f, 2.0f, 2.0f, 2.0f}), 3.0f, 0.0f, 0.0f, 0.0f)); + __m128d test_mm_div_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_div_round_sd // CHECK: @llvm.x86.avx512.mask.div.sd.round @@ -3800,6 +3864,8 @@ __m128d test_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_mask_div_sd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_div_sd((__m128d)(__v2df){-8.0, 44.0}, 0x1, (__m128d)(__v2df){8.0, 10.0}, (__m128d)(__v2df){2.0, 20.0}), 4.0, 44.0)); + __m128d test_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_div_sd // CHECK: extractelement <2 x double> %{{.*}}, i32 0 @@ -3814,6 +3880,8 @@ __m128d test_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_maskz_div_sd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_div_sd(0x1, (__m128d)(__v2df){8.0, 10.0}, (__m128d)(__v2df){2.0, 20.0}), 4.0, 0.0)); + __m128 test_mm_max_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_max_round_ss // CHECK: @llvm.x86.avx512.mask.max.ss.round @@ -11673,6 +11741,7 @@ __m128 test_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) // CHECK-NEXT: insertelement <4 x float> [[VEC]], float [[SEL]], i64 0 return _mm_mask_move_ss ( __W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128(_mm_mask_move_ss((__m128)(__v4sf){1.0f,2.0f,3.0f,4.0f}, (__mmask8)0x01, (__m128)(__v4sf){100.0f,200.0f,300.0f,400.0f}, (__m128)(__v4sf){9.0f,10.0f,11.0f,12.0f}), 9.0f,2.0f,3.0f,4.0f)); __m128 test_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) { @@ -11687,6 +11756,7 @@ __m128 
test_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) // CHECK-NEXT: insertelement <4 x float> [[VEC]], float [[SEL]], i64 0 return _mm_maskz_move_ss (__U, __A, __B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_move_ss((__mmask8)0x01, (__m128)(__v4sf){0.0f,0.0f,0.0f,0.0f}, (__m128)(__v4sf){9.0f,10.0f,11.0f,12.0f}), 9.0f,0.0f,0.0f,0.0f)); __m128d test_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { @@ -11701,6 +11771,7 @@ __m128d test_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __ // CHECK-NEXT: insertelement <2 x double> [[VEC]], double [[SEL]], i64 0 return _mm_mask_move_sd ( __W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_move_sd((__m128d)(__v2df){1.0,2.0}, (__mmask8)0x01, (__m128d)(__v2df){100.0,200.0}, (__m128d)(__v2df){9.0,10.0}), 9.0,2.0)); __m128d test_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) { @@ -11715,6 +11786,7 @@ __m128d test_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) // CHECK-NEXT: insertelement <2 x double> [[VEC]], double [[SEL]], i64 0 return _mm_maskz_move_sd (__U, __A, __B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_move_sd((__mmask8)0x01, (__m128d)(__v2df){0.0,0.0}, (__m128d)(__v2df){9.0,10.0}), 9.0,0.0)); void test_mm_mask_store_ss(float * __P, __mmask8 __U, __m128 __A) { diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c index f0a0a3b28542f..1c8ab8ca52099 100644 --- a/clang/test/CodeGen/X86/avx512fp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -796,6 +796,8 @@ __m128h test_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_mask_add_sh(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_mask_add_sh((__m128h)(__v8hf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f},(__mmask8)0x01,(__m128h)(__v8hf){10.0f,20.0f,30.0f,40.0f,50.0f,60.0f,70.0f,80.0f},(__m128h)(__v8hf){100.0f,200.0f,300.0f,400.0f,500.0f,600.0f,700.0f,800.0f}),110.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); + __m128h test_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_maskz_add_sh // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 @@ -810,6 +812,7 @@ __m128h test_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_maskz_add_sh(__U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_maskz_add_sh((__mmask8)0x01,(__m128h)(__v8hf){10.0f,20.0f,30.0f,40.0f,50.0f,60.0f,70.0f,80.0f},(__m128h)(__v8hf){100.0f,200.0f,300.0f,400.0f,500.0f,600.0f,700.0f,800.0f}),110.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f)); __m128h test_mm_add_sh(__m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_add_sh @@ -849,6 +852,8 @@ __m128h test_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_mask_sub_sh(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_mask_sub_sh((__m128h)(__v8hf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f},(__mmask8)0x01,(__m128h)(__v8hf){20.0f,21.0f,22.0f,23.0f,24.0f,25.0f,26.0f,27.0f},(__m128h)(__v8hf){5.0f,6.0f,7.0f,8.0f,9.0f,10.0f,11.0f,12.0f}),15.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); + __m128h test_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_maskz_sub_sh // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 @@ -863,6 +868,7 @@ __m128h test_mm_maskz_sub_sh(__mmask8 __U, __m128h 
__A, __m128h __B) { // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_maskz_sub_sh(__U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_maskz_sub_sh((__mmask8)0x01,(__m128h)(__v8hf){20.0f,21.0f,22.0f,23.0f,24.0f,25.0f,26.0f,27.0f},(__m128h)(__v8hf){5.0f,6.0f,7.0f,8.0f,9.0f,10.0f,11.0f,12.0f}),15.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f)); __m128h test_mm_sub_sh(__m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_sub_sh @@ -902,6 +908,8 @@ __m128h test_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_mask_mul_sh(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_mask_mul_sh((__m128h)(__v8hf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f},(__mmask8)0x01,(__m128h)(__v8hf){3.0f,4.0f,5.0f,6.0f,7.0f,8.0f,9.0f,10.0f},(__m128h)(__v8hf){4.0f,5.0f,6.0f,7.0f,8.0f,9.0f,10.0f,11.0f}),12.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); + __m128h test_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_maskz_mul_sh // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 @@ -916,6 +924,7 @@ __m128h test_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_maskz_mul_sh(__U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_maskz_mul_sh((__mmask8)0x01,(__m128h)(__v8hf){3.0f,4.0f,5.0f,6.0f,7.0f,8.0f,9.0f,10.0f},(__m128h)(__v8hf){4.0f,5.0f,6.0f,7.0f,8.0f,9.0f,10.0f,11.0f}),12.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f)); __m128h test_mm_mul_sh(__m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_mul_sh @@ -955,6 +964,8 @@ __m128h test_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_mask_div_sh(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_mask_div_sh((__m128h)(__v8hf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f},(__mmask8)0x01,(__m128h)(__v8hf){8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f},(__m128h)(__v8hf){4.0f,3.0f,2.0f,1.0f,2.0f,3.0f,4.0f,5.0f}),2.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); + __m128h test_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_maskz_div_sh // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 @@ -969,6 +980,7 @@ __m128h test_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_maskz_div_sh(__U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_maskz_div_sh((__mmask8)0x01,(__m128h)(__v8hf){8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f},(__m128h)(__v8hf){4.0f,3.0f,2.0f,1.0f,2.0f,3.0f,4.0f,5.0f}),2.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f)); __m128h test_mm_div_sh(__m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_div_sh @@ -1622,6 +1634,7 @@ __m128h test_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B // CHECK-NEXT: insertelement <8 x half> [[VEC]], half [[SEL]], i64 0 return _mm_mask_move_sh(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_mask_move_sh((__m128h)(__v8hf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f}, (__mmask8)0x01, (__m128h)(__v8hf){10.0f,20.0f,30.0f,40.0f,50.0f,60.0f,70.0f,80.0f}, (__m128h)(__v8hf){100.0f,200.0f,300.0f,400.0f,500.0f,600.0f,700.0f,800.0f}), 100.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); __m128h test_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_maskz_move_sh @@ -1635,6 +1648,7 @@ __m128h 
test_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-NEXT: insertelement <8 x half> [[VEC]], half [[SEL]], i64 0 return _mm_maskz_move_sh(__U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_maskz_move_sh((__mmask8)0x01, (__m128h)(__v8hf){10.0f,20.0f,30.0f,40.0f,50.0f,60.0f,70.0f,80.0f}, (__m128h)(__v8hf){100.0f,200.0f,300.0f,400.0f,500.0f,600.0f,700.0f,800.0f}), 100.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f)); short test_mm_cvtsi128_si16(__m128i A) { // CHECK-LABEL: test_mm_cvtsi128_si16 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index a7eee79c97539..e05b1ddf7b69a 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -770,6 +770,7 @@ __m256i test_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X, //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_mul_epi32(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_m256i(_mm256_mask_mul_epi32((__m256i){100,200,300,400}, (__mmask8)0b00001100, (__m256i){1,2,3,4}, (__m256i){10,20,30,40}), 100,200,90,160)); __m256i test_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) { //CHECK-LABEL: test_mm256_maskz_mul_epi32 @@ -781,7 +782,7 @@ __m256i test_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) { //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_mul_epi32(__M, __X, __Y); } - +TEST_CONSTEXPR(match_m256i(_mm256_maskz_mul_epi32((__mmask8)0b00001100, (__m256i){1,2,3,4}, (__m256i){10,20,30,40}), 0,0,90,160)); __m128i test_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { @@ -794,6 +795,7 @@ __m128i test_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_mul_epi32(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_m128i(_mm_mask_mul_epi32((__m128i){100,200}, (__mmask8)0b00000001, (__m128i){1,2}, (__m128i){10,20}), 10,200)); __m128i test_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) { //CHECK-LABEL: test_mm_maskz_mul_epi32 @@ -805,6 +807,7 @@ __m128i test_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) { //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_mul_epi32(__M, __X, __Y); } +TEST_CONSTEXPR(match_m128i(_mm_maskz_mul_epi32((__mmask8)0b00000010, (__m128i){1,2}, (__m128i){10,20}), 0,40)); __m256i test_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { @@ -815,6 +818,7 @@ __m256i test_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_mul_epu32(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_m256i(_mm256_mask_mul_epu32((__m256i){100,200,300,400}, (__mmask8)0b00001100, (__m256i){1,2,3,4}, (__m256i){10,20,30,40}), 100,200,90,160)); __m256i test_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) { //CHECK-LABEL: test_mm256_maskz_mul_epu32 @@ -824,6 +828,7 @@ __m256i test_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) { //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_mul_epu32(__M, __X, __Y); } +TEST_CONSTEXPR(match_m256i(_mm256_maskz_mul_epu32((__mmask8)0b00001100, (__m256i){1,2,3,4}, (__m256i){10,20,30,40}), 0,0,90,160)); __m128i test_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { @@ -834,6 +839,7 @@ __m128i test_mm_mask_mul_epu32 
(__m128i __W, __mmask8 __M, __m128i __X, //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_mul_epu32(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_m128i(_mm_mask_mul_epu32((__m128i){100,200}, (__mmask8)0b00000001, (__m128i){1,2}, (__m128i){10,20}), 10,200)); __m128i test_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) { //CHECK-LABEL: test_mm_maskz_mul_epu32 @@ -843,6 +849,7 @@ __m128i test_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) { //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_mul_epu32(__M, __X, __Y); } +TEST_CONSTEXPR(match_m128i(_mm_maskz_mul_epu32((__mmask8)0b00000010, (__m128i){1,2}, (__m128i){10,20}), 0,40)); __m128i test_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B) { //CHECK-LABEL: test_mm_maskz_mullo_epi32 @@ -3606,48 +3613,64 @@ __m128d test_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_add_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_add_pd((__m128d){1.0, 2.0}, (__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 110.0, 2.0)); + __m128d test_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_add_pd // CHECK: fadd <2 x double> %{{.*}}, %{{.*}} // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_add_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_add_pd((__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 110.0, 0.0)); + __m256d test_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_mask_add_pd // CHECK: fadd <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_add_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_add_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 1.0, 2.0, 330.0, 440.0)); + __m256d test_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_add_pd // CHECK: fadd <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_add_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_add_pd((__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 0.0, 0.0, 330.0, 440.0)); + __m128 test_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_mask_add_ps // CHECK: fadd <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_add_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_add_ps((__m128){1.0f, 2.0f, 3.0f, 4.0f}, (__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 1.0f, 220.0f, 3.0f, 440.0f)); + __m128 test_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_add_ps // CHECK: fadd <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_add_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_add_ps((__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 0.0f, 220.0f, 0.0f, 440.0f)); + __m256 test_mm256_mask_add_ps(__m256 
__W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_mask_add_ps // CHECK: fadd <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_add_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_mask_add_ps((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, (__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 1.0f, 2.0f, 3.0f, 4.0f, 550.0f, 660.0f, 770.0f, 880.0f)); + __m256 test_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_add_ps // CHECK: fadd <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_add_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_add_ps((__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 550.0f, 660.0f, 770.0f, 880.0f)); + __m128i test_mm_mask_blend_epi32(__mmask8 __U, __m128i __A, __m128i __W) { // CHECK-LABEL: test_mm_mask_blend_epi32 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} @@ -4352,48 +4375,64 @@ __m128d test_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_div_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_div_pd((__m128d){1.0, 2.0}, (__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 0.1, 2.0)); + __m128d test_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_div_pd // CHECK: fdiv <2 x double> %{{.*}}, %{{.*}} // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_div_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_div_pd((__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 0.1, 0.0)); + __m256d test_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_mask_div_pd // CHECK: fdiv <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_div_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_div_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 1.0, 2.0, 0.1, 0.1)); + __m256d test_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_div_pd // CHECK: fdiv <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_div_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_div_pd((__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 0.0, 0.0, 0.1, 0.1)); + __m128 test_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_mask_div_ps // CHECK: fdiv <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_div_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_div_ps((__m128){1.0f, 2.0f, 3.0f, 4.0f}, (__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 1.0f, 0.1f, 3.0f, 0.1f)); + __m128 test_mm_maskz_div_ps(__mmask8 
__U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_div_ps // CHECK: fdiv <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_div_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_div_ps((__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 0.0f, 0.1f, 0.0f, 0.1f)); + __m256 test_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_mask_div_ps // CHECK: fdiv <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_div_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_mask_div_ps((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, (__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.1f, 0.1f, 0.1f)); + __m256 test_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_div_ps // CHECK: fdiv <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_div_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_div_ps((__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 0.1f, 0.1f, 0.1f, 0.1f)); + __m128d test_mm_mask_expand_pd(__m128d __W, __mmask8 __U, __m128d __A) { // CHECK-LABEL: test_mm_mask_expand_pd // CHECK: @llvm.x86.avx512.mask.expand @@ -4716,48 +4755,64 @@ __m128d test_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_mul_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_mul_pd((__m128d){1.0, 2.0}, (__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 1000.0, 2.0)); + __m128d test_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_mul_pd // CHECK: fmul <2 x double> %{{.*}}, %{{.*}} // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_mul_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_mul_pd((__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 1000.0, 0.0)); + __m256d test_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_mask_mul_pd // CHECK: fmul <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_mul_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_mul_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 1.0, 2.0, 9000.0, 16000.0)); + __m256d test_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_mul_pd // CHECK: fmul <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_mul_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_mul_pd((__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 0.0, 0.0, 9000.0, 16000.0)); + __m128 test_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_mask_mul_ps // 
CHECK: fmul <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_mul_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_mul_ps((__m128){1.0f, 2.0f, 3.0f, 4.0f}, (__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 1.0f, 4000.0f, 3.0f, 16000.0f)); + __m128 test_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_mul_ps // CHECK: fmul <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_mul_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_mul_ps((__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 0.0f, 4000.0f, 0.0f, 16000.0f)); + __m256 test_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_mask_mul_ps // CHECK: fmul <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_mul_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_mask_mul_ps((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, (__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 1.0f, 2.0f, 3.0f, 4.0f, 25000.0f, 36000.0f, 49000.0f, 64000.0f)); + __m256 test_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_mul_ps // CHECK: fmul <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_mul_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_mul_ps((__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 25000.0f, 36000.0f, 49000.0f, 64000.0f)); + __m128i test_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_abs_epi32 // CHECK: [[ABS:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %{{.*}}, i1 false) @@ -5562,48 +5617,64 @@ __m128d test_mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_sub_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_sub_pd((__m128d){1.0, 2.0}, (__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), -90.0, 2.0)); + __m128d test_mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_sub_pd // CHECK: fsub <2 x double> %{{.*}}, %{{.*}} // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_sub_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_sub_pd((__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), -90.0, 0.0)); + __m256d test_mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_mask_sub_pd // CHECK: fsub <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_sub_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_sub_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 1.0, 2.0, -270.0, -360.0)); + __m256d test_mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { // 
CHECK-LABEL: test_mm256_maskz_sub_pd // CHECK: fsub <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_sub_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_sub_pd((__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 0.0, 0.0, -270.0, -360.0)); + __m128 test_mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_mask_sub_ps // CHECK: fsub <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_sub_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_sub_ps((__m128){1.0f, 2.0f, 3.0f, 4.0f}, (__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 1.0f, -180.0f, 3.0f, -360.0f)); + __m128 test_mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_sub_ps // CHECK: fsub <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_sub_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_sub_ps((__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 0.0f, -180.0f, 0.0f, -360.0f)); + __m256 test_mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_mask_sub_ps // CHECK: fsub <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_sub_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_mask_sub_ps((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, (__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 1.0f, 2.0f, 3.0f, 4.0f, -450.0f, -540.0f, -630.0f, -720.0f)); + __m256 test_mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_sub_ps // CHECK: fsub <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_sub_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_sub_ps((__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 0.0f, 0.0f, 0.0f, 0.0f, -450.0f, -540.0f, -630.0f, -720.0f)); + __m128i test_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { // CHECK-LABEL: test_mm_mask2_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 diff --git a/clang/test/CodeGen/sanitize-metadata-nosanitize.c b/clang/test/CodeGen/sanitize-metadata-nosanitize.c index f2672d7f89157..74b5c9b03754f 100644 --- a/clang/test/CodeGen/sanitize-metadata-nosanitize.c +++ b/clang/test/CodeGen/sanitize-metadata-nosanitize.c @@ -10,7 +10,7 @@ // CHECK: @llvm.global_ctors = appending global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_covered2.module_ctor, ptr @__sanitizer_metadata_covered2.module_ctor }, { i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_atomics2.module_ctor, ptr @__sanitizer_metadata_atomics2.module_ctor }] // CHECK: @llvm.global_dtors = appending global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_covered2.module_dtor, ptr @__sanitizer_metadata_covered2.module_dtor }, { i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_atomics2.module_dtor, 
ptr @__sanitizer_metadata_atomics2.module_dtor }] //. -// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) // CHECK-LABEL: define dso_local void @escape( // CHECK-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !pcsections [[META6:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -21,7 +21,7 @@ __attribute__((noinline, not_tail_called)) void escape(const volatile void *p) { sink = p; } -// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) +// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) // CHECK-LABEL: define dso_local i32 @normal_function( // CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !pcsections [[META8:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -38,7 +38,7 @@ int normal_function(int *x, int *y) { return *y; } -// CHECK: Function Attrs: disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) +// CHECK: Function Attrs: disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) // CHECK-LABEL: define dso_local i32 @test_disable_sanitize_instrumentation( // CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -55,7 +55,7 @@ __attribute__((disable_sanitizer_instrumentation)) int test_disable_sanitize_ins return *y; } -// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) +// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) // CHECK-LABEL: define dso_local i32 @test_no_sanitize_thread( // CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !pcsections [[META14:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -72,7 +72,7 @@ __attribute__((no_sanitize("thread"))) int test_no_sanitize_thread(int *x, int * return *y; } -// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) +// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) // CHECK-LABEL: define dso_local i32 @test_no_sanitize_all( // CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] !pcsections [[META14]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -89,10 +89,10 @@ __attribute__((no_sanitize("all"))) int test_no_sanitize_all(int *x, int *y) { return *y; } //. 
-// CHECK: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR1]] = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR2]] = { disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR3]] = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "no_sanitize_thread" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR1]] = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR2]] = { disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR3]] = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "no_sanitize_thread" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } // CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } //. 
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} diff --git a/clang/test/CodeGenOpenCL/convergent.cl b/clang/test/CodeGenOpenCL/convergent.cl index 53a35a4f73119..99d9ee74e669b 100644 --- a/clang/test/CodeGenOpenCL/convergent.cl +++ b/clang/test/CodeGenOpenCL/convergent.cl @@ -133,7 +133,7 @@ kernel void assume_convergent_asm() __asm__ volatile("s_barrier"); } -// CHECK: attributes #0 = { nofree noinline norecurse nounwind " +// CHECK: attributes #0 = { nofree noinline norecurse nounwind memory(readwrite, target_mem0: none, target_mem1: none) " // CHECK: attributes #1 = { {{[^}]*}}convergent{{[^}]*}} } // CHECK: attributes #2 = { {{[^}]*}}convergent{{[^}]*}} } // CHECK: attributes #3 = { {{[^}]*}}convergent noduplicate{{[^}]*}} } diff --git a/clang/test/Driver/amdgpu-openmp-sanitize-options.c b/clang/test/Driver/amdgpu-openmp-sanitize-options.c index 10d64984918e6..fd7d11803249c 100644 --- a/clang/test/Driver/amdgpu-openmp-sanitize-options.c +++ b/clang/test/Driver/amdgpu-openmp-sanitize-options.c @@ -52,6 +52,48 @@ // RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ --offload-arch=gfx900:xnack+ -fsanitize=address -fno-gpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ // RUN: | FileCheck -check-prefixes=HOSTSAN,NOGPUSAN,SAN %s +// Catch invalid combination of sanitizers regardless of their order and ignore +// them selectively. +// (The address sanitizer enables the device sanitizer pipeline. The fuzzer +// implicitly turns on LLVMs SanitizerCoverage, which the driver then forwards +// to the device cc1. SanitizerCoverage is not supported on amdgcn.) + +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address,fuzzer --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSANCOMBINATION,INVALIDCOMBINATION1 %s +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=fuzzer,address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSANCOMBINATION,INVALIDCOMBINATION2 %s + +// Do the same for multiple -fsanitize arguments and multi-arch scenarios. 
+ +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ --offload-arch=gfx900:xnack- -fsanitize=address,fuzzer --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSANCOMBINATION,INVALIDCOMBINATION1 %s +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+,gfx900:xnack- -fsanitize=address,fuzzer --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSANCOMBINATION,INVALIDCOMBINATION1 %s +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+,gfx900:xnack- -fsanitize=fuzzer,address -fsanitize=leak --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSANCOMBINATION2,NOTSUPPORTED-DAG,INVALIDCOMBINATION2 %s + +// Check for -fsanitize-coverage options +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -fsanitize-coverage=inline-bool-flag --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=WARNSANCOV %s + +// Test -Xarch_device error scenario + +// RUN: not %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -Xarch_device -fsanitize=leak --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=UNSUPPORTEDERROR %s + +// RUN: not %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack- -Xarch_device -fsanitize=address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=XNACKERROR %s + +// RUN: not %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -Xarch_device -fsanitize=fuzzer,address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=INVALIDCOMBINATIONERROR %s + +// RUN: not %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -Xarch_device -fsanitize-coverage-stack-depth-callback-min=42 --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=ERRSANCOV %s + + +// INVALIDCOMBINATION1: warning: ignoring 'fuzzer' in '-fsanitize=address,fuzzer' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// INVALIDCOMBINATION2: warning: ignoring 'fuzzer' in '-fsanitize=fuzzer,address' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] + // FAIL-DAG: error: cannot find ROCm device library for ABI version 5; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library // NOTSUPPORTED-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' @@ -59,6 +101,8 @@ // XNACKNEG: warning: ignoring '-fsanitize=address' option for offload arch 'gfx908:xnack-' as it is not currently supported there. 
Use it with an offload arch containing 'xnack+' instead // HOSTSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} +// HOSTSANCOMBINATION: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address,fuzzer,fuzzer-no-link".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} +// HOSTSANCOMBINATION2: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address,fuzzer,fuzzer-no-link,leak".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} // GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-mlink-bitcode-file" "[^"]*asanrtl.bc".* "-mlink-bitcode-file" "[^"]*ockl.bc".* "-target-cpu" "(gfx908|gfx900|gfx1250|gfx1251)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}} // NOGPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-x" "c".*}} @@ -66,3 +110,10 @@ // SAN: {{"[^"]*llvm-offload-binary[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=(gfx908|gfx1250|gfx1251)(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}} // SAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "ir".*}} // SAN: {{"[^"]*clang-linker-wrapper[^"]*".* "--host-triple=x86_64-unknown-linux-gnu".* "--linker-path=[^"]*".* "--whole-archive" "[^"]*(libclang_rt.asan_static.a|libclang_rt.asan_static-x86_64.a)".* "--whole-archive" "[^"]*(libclang_rt.asan.a|libclang_rt.asan-x86_64.a)".*}} + +// UNSUPPORTEDERROR: error: '-fsanitize=leak' option is not currently supported for target 'amdgcn-amd-amdhsa' +// XNACKERROR: error: '-fsanitize=address' option for offload arch 'gfx908:xnack-' is not currently supported there. Use it with an offload arch containing 'xnack+' instead +// INVALIDCOMBINATIONERROR: error: 'fuzzer' in '-fsanitize=fuzzer,address' option is not currently supported for target 'amdgcn-amd-amdhsa' + +// WARNSANCOV: warning: ignoring '-fsanitize-coverage=inline-bool-flag' option as it is not currently supported for target 'amdgcn-amd-amdhsa' +// ERRSANCOV: error: '-fsanitize-coverage-stack-depth-callback-min=42' option is not currently supported for target 'amdgcn-amd-amdhsa' diff --git a/clang/test/Driver/hip-sanitize-options.hip b/clang/test/Driver/hip-sanitize-options.hip index 490385173a4cb..d436756ee046b 100644 --- a/clang/test/Driver/hip-sanitize-options.hip +++ b/clang/test/Driver/hip-sanitize-options.hip @@ -52,6 +52,51 @@ // RUN: -fsanitize=leak -nogpuinc --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefixes=NOGPUNEG %s +// Catch invalid combination of sanitizers regardless of their order and ignore +// them selectively. +// (The address sanitizer enables the device sanitizer pipeline. The fuzzer +// implicitly turns on LLVMs SanitizerCoverage, which the driver then forwards +// to the device cc1. SanitizerCoverage is not supported on amdgcn.) 
+ +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -fsanitize=address,fuzzer --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=INVALIDCOMBINATION,INVALIDCOMBINATION1 %s +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -fsanitize=fuzzer,address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=INVALIDCOMBINATION,INVALIDCOMBINATION2 %s + +// Do the same for multiple -fsanitize arguments and multi-arch scenarios. + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ --offload-arch=gfx908:xnack- \ +// RUN: -fsanitize=address,fuzzer -fsanitize=leak --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=MULT1,XNACK2 %s +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+,gfx908:xnack- \ +// RUN: -fsanitize=fuzzer,address -fsanitize=leak --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=MULT2,XNACK2 %s + +// Check for -fsanitize-coverage options +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -fsanitize=address -fsanitize-coverage=inline-bool-flag --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=WARNSANCOV %s + +// Test -Xarch_device error scenario + +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -Xarch_device -fsanitize=leak --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=UNSUPPORTEDERROR %s + +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack- \ +// RUN: -Xarch_device -fsanitize=address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=XNACKERROR %s + +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -Xarch_device -fsanitize=fuzzer,address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=INVALIDCOMBINATIONERROR %s + +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -fsanitize=address -Xarch_device -fsanitize-coverage-stack-depth-callback-min=42 --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=ERRSANCOV %s + // CHECK-NOT: {{"[^"]*clang[^"]*".* "-fcuda-is-device".* "-fsanitize=address"}} // CHECK-NOT: {{"[^"]*clang[^"]*".* "-fcuda-is-device".* "-mlink-bitcode-file" ".*asanrtl.bc"}} // CHECK-NOT: {{"[^"]*lld(\.exe){0,1}".* ".*hip.bc"}} @@ -101,3 +146,31 @@ // NOGPUNEG-NOT: {{"[^"]*clang[^"]*".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-target-cpu" "gfx900".* "-target-feature" "-xnack"}} // NOGPUNEG-NOT: {{"[^"]*clang[^"]*".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-target-cpu" "gfx906"}} // NOGPUNEG-NOT: {{"[^"]*lld(\.exe){0,1}".* ".*hip.bc"}} + +// INVALIDCOMBINATION1-DAG: warning: ignoring 'fuzzer' in '-fsanitize=address,fuzzer' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// INVALIDCOMBINATION2-DAG: warning: ignoring 'fuzzer' in '-fsanitize=fuzzer,address' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// INVALIDCOMBINATION-DAG: {{"[^"]*clang[^"]*".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-target-cpu" "gfx900".* "-target-feature" "\+xnack".* "-fsanitize=address"}} +// INVALIDCOMBINATION-DAG: {{"[^"]*clang[^"]*".* "-triple" "x86_64-unknown-linux-gnu".* "-fsanitize=address,fuzzer,fuzzer-no-link"}} + +// MULT1-DAG: 
warning: ignoring 'fuzzer' in '-fsanitize=address,fuzzer' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT1-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT1-DAG: warning: ignoring 'fuzzer' in '-fsanitize=address,fuzzer' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT1-DAG: warning: ignoring '-fsanitize=address,fuzzer' option for offload arch 'gfx908:xnack-' as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead [-Woption-ignored] +// MULT1-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] + +// MULT2-DAG: warning: ignoring 'fuzzer' in '-fsanitize=fuzzer,address' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT2-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT2-DAG: warning: ignoring 'fuzzer' in '-fsanitize=fuzzer,address' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT2-DAG: warning: ignoring '-fsanitize=fuzzer,address' option for offload arch 'gfx908:xnack-' as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead [-Woption-ignored] +// MULT2-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] + +// XNACK2-DAG: {{"[^"]*clang[^"]*".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-target-cpu" "gfx900".* "-target-feature" "\+xnack".* "-fsanitize=address"}} +// XNACK2-DAG: {{"[^"]*clang[^"]*".* "-target-cpu" "gfx908"}} +// XNACK2-DAG: {{"[^"]*clang[^"]*".* "-triple" "x86_64-unknown-linux-gnu".* "-fsanitize=address,fuzzer,fuzzer-no-link,leak"}} + +// UNSUPPORTEDERROR: error: '-fsanitize=leak' option is not currently supported for target 'amdgcn-amd-amdhsa' +// XNACKERROR: error: '-fsanitize=address' option for offload arch 'gfx900:xnack-' is not currently supported there. Use it with an offload arch containing 'xnack+' instead +// INVALIDCOMBINATIONERROR: error: 'fuzzer' in '-fsanitize=fuzzer,address' option is not currently supported for target 'amdgcn-amd-amdhsa' + +// WARNSANCOV: warning: ignoring '-fsanitize-coverage=inline-bool-flag' option as it is not currently supported for target 'amdgcn-amd-amdhsa' +// ERRSANCOV: error: '-fsanitize-coverage-stack-depth-callback-min=42' option is not currently supported for target 'amdgcn-amd-amdhsa' diff --git a/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake b/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake new file mode 100644 index 0000000000000..ace80ce5583c7 --- /dev/null +++ b/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake @@ -0,0 +1,39 @@ +# Helper function to find out whether the assembler supports a particular +# command-line flag. You'd like to use the standard check_compiler_flag(), but +# that only supports a fixed list of languages, and ASM isn't one of them. So +# we do it ourselves, by trying to assemble an empty source file. 
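A usage sketch (illustrative only; it simply mirrors the call made in the builtins CMakeLists.txt later in this patch): probe for the flag once, then consult the result variable before deciding to pass the flag to the assembler.

  check_assembler_flag(COMPILER_RT_HAS_MIMPLICIT_IT -mimplicit-it=always)
  if(COMPILER_RT_HAS_MIMPLICIT_IT)
    set(implicit_it_flag -mimplicit-it=always)
  endif()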
+ +function(check_assembler_flag outvar flag) + if(NOT DEFINED "${outvar}") + if(NOT CMAKE_REQUIRED_QUIET) + message(CHECK_START "Checking for assembler flag ${flag}") + endif() + + # Stop try_compile from attempting to link the result of the assembly, so + # that we don't depend on having a working linker, and also don't have to + # figure out what special symbol like _start needs to be defined in the + # test input. + # + # This change is made within the dynamic scope of this function, so + # CMAKE_TRY_COMPILE_TARGET_TYPE will be restored to its previous value on + # return. + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) + + # Try to assemble an empty file with a .S name, using the provided flag. + set(asm_source_file + ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CheckAssemblerFlag.S) + write_file(${asm_source_file} "") + try_compile(${outvar} + ${CMAKE_BINARY_DIR} + SOURCES ${asm_source_file} + COMPILE_DEFINITIONS ${flag}) + + if(NOT CMAKE_REQUIRED_QUIET) + if(${outvar}) + message(CHECK_PASS "Accepted") + else() + message(CHECK_FAIL "Not accepted") + endif() + endif() + endif() +endfunction() diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 0521df1a70961..43abc79624773 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -60,6 +60,7 @@ endif() include(builtin-config-ix) include(CMakeDependentOption) include(CMakePushCheckState) +include(CheckAssemblerFlag) option(COMPILER_RT_BUILTINS_HIDE_SYMBOLS "Do not export any symbols from the static library." ON) @@ -423,6 +424,40 @@ set(arm_or_thumb2_base_SOURCES ${GENERIC_SOURCES} ) +option(COMPILER_RT_ARM_OPTIMIZED_FP + "On 32-bit Arm, use optimized assembly implementations of FP arithmetic. Likely to increase code size, but be faster." 
ON) + +set(arm_or_thumb2_optimized_fp_SOURCES) +if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm") + check_assembler_flag(COMPILER_RT_HAS_MIMPLICIT_IT -mimplicit-it=always) + if(COMPILER_RT_HAS_MIMPLICIT_IT) + set(implicit_it_flag -mimplicit-it=always) + else() + check_assembler_flag( + COMPILER_RT_HAS_WA_MIMPLICIT_IT -Wa,-mimplicit-it=always) + if(COMPILER_RT_HAS_WA_MIMPLICIT_IT) + set(implicit_it_flag -Wa,-mimplicit-it=always) + else() + message(WARNING "Don't know how to set the -mimplicit-it=always flag in this assembler; not including Arm optimized implementations") + set(implicit_it_flag "") + endif() + endif() + + if(implicit_it_flag) + set(assembly_files + arm/mulsf3.S + arm/divsf3.S) + set_source_files_properties(${assembly_files} + PROPERTIES COMPILE_OPTIONS ${implicit_it_flag}) + set(arm_or_thumb2_optimized_fp_SOURCES + ${assembly_files} + arm/fnan2.c + arm/fnorm2.c + arm/funder.c + ) + endif() +endif() + set(arm_sync_SOURCES arm/sync_fetch_and_add_4.S arm/sync_fetch_and_add_8.S @@ -456,6 +491,16 @@ set(thumb1_base_SOURCES ${GENERIC_SOURCES} ) +if(COMPILER_RT_ARM_OPTIMIZED_FP) + set(thumb1_base_SOURCES + arm/thumb1/mulsf3.S + arm/fnan2.c + arm/fnorm2.c + arm/funder.c + ${thumb1_base_SOURCES} + ) +endif() + set(arm_EABI_RT_SOURCES arm/aeabi_cdcmp.S arm/aeabi_cdcmpeq_check_nan.c @@ -567,6 +612,7 @@ if(MINGW) arm/aeabi_uldivmod.S arm/chkstk.S ${arm_or_thumb2_base_SOURCES} + ${arm_or_thumb2_optimized_fp_SOURCES} ${arm_sync_SOURCES} ) @@ -577,6 +623,7 @@ elseif(NOT WIN32) # TODO the EABI sources should only be added to EABI targets set(arm_SOURCES ${arm_or_thumb2_base_SOURCES} + ${arm_or_thumb2_optimized_fp_SOURCES} ${arm_sync_SOURCES} ${arm_EABI_SOURCES} ${arm_Thumb1_SOURCES} diff --git a/compiler-rt/lib/builtins/arm/divsf3.S b/compiler-rt/lib/builtins/arm/divsf3.S new file mode 100644 index 0000000000000..faabd8225c344 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/divsf3.S @@ -0,0 +1,618 @@ +//===-- divsf3.S - single-precision floating point division ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float division with the IEEE-754 +// default rounding (to nearest, ties to even), in optimized AArch32 assembly +// language suitable to be built as either Arm or Thumb2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + + .syntax unified + .text + .p2align 2 + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__divsf3) + push {r4, lr} + vmov r0, s0 + vmov r1, s1 + bl __aeabi_fdiv + vmov s0, r0 + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__divsf3, __aeabi_fdiv) +#endif + +DEFINE_COMPILERRT_FUNCTION(__aeabi_fdiv) + // Extract the exponents of the inputs into r2 and r3, occupying bits 16-23 + // of each register so that there will be space lower down to store extra + // data without exponent arithmetic carrying into it. In the process, check + // both exponents for 00 or FF and branch out of line to handle all the + // uncommon types of value (infinity, NaN, zero, denormals). + // + // Chaining conditional instructions like this means that the second + // instruction (setting up r3) might not be executed at all, so fdiv_uncommon + // will have to redo it just in case. 
That saves an instruction here, + // executed for _all_ inputs, and moves it to the uncommon path run for only + // some inputs. + mov r12, #0xFF0000 + ands r2, r12, r0, lsr #7 // r2 has exponent of numerator. (Is it 0?) + andsne r3, r12, r1, lsr #7 // r3 has exponent of denominator. (Is it 0?) + teqne r2, r12 // if neither was 0, is one FF? + teqne r3, r12 // or the other? + beq LOCAL_LABEL(uncommon) // branch out of line if any answer was yes + + // Calculate the output sign, which is always just the XOR of the input + // signs. Store it in bit 8 of r2, below the numerator exponent. + teq r0, r1 // is the output sign bit 1? + orrmi r2, r2, #0x100 // if so, set bit 8 of r2 + + // Isolate the mantissas of both values, by setting bit 23 of each one and + // clearing the 8 bits above that. + // + // In the process, swap the register allocations (which doesn't cost extra + // instructions if we do it as part of this manipulation). We want the + // numerator not to be in r0, because r0 is where we'll build up the quotient + // while subtracting things from the numerator. + orr r12, r0, #1 << 23 + orr r0, r1, #1 << 23 + bic r1, r12, #0xFF000000 + bic r0, r0, #0xFF000000 + +LOCAL_LABEL(div): + // Start of the main division. We get here knowing that: + // + // r0 = mantissa of denominator, with the leading 1 at bit 23 + // r1 = mantissa of numerator, similarly + // r2 = (exponent of numerator << 16) + (result sign << 8) + // r3 = (exponent of denominator << 16) + + push {r14} // we'll need an extra register + + // Calculate the initial result exponent by just subtracting the two input + // exponents. This doesn't affect the sign bit lower down in r2. + sub r2, r2, r3 + + // That initial exponent might need to be adjusted by 1, depending on whether + // dividing the mantissas gives a value >=1 or <1. We don't need to wait + // until the division is finished to work that out: we can tell immediately + // by just comparing the mantissas. + // + // The basic idea is to do the comparison in a way that sets the C flag if + // numerator >= denominator. Then we recombine the sign and exponent by doing + // "ADC r2, r2, r2, asr #16": the exponent in the top half of r2 is shifted + // down to the low 8 bits, just below the sign bit, and using ADC rather than + // ADD folds in the conditional increment from the mantissa comparison. + // + // If we're not incrementing the output exponent, we instead shift the + // numerator mantissa left by 1, so that it _is_ greater than the denominator + // mantissa. Otherwise we'd generate only a 22-bit quotient, instead of 23. + // + // The exponent also needs to be rebiased, so that dividing two numbers the + // same gives an output exponent of 0x7F. If the two inputs have the same + // exponent then we'll have computed an exponent of 0 via the SUB instruction + // above; if the mantissas are the same as well then the ADC will increment + // it; also, the leading bit of the quotient will increment the exponent + // again when we recombine it with the output mantissa later. So we need to + // add (0x7F - 2) to the mantissa now, to make an exponent of 0 from the SUB + // come to 0x7F after both of those increments. 
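A quick worked case of the bias arithmetic just described (plain arithmetic, not taken from the original comments): dividing 1.0f by 1.0f, both input exponents are 0x7F, so the SUB leaves 0 in the top half of r2. The mantissas are equal, so the comparison sets C and the ADC contributes +1, and the leading bit of the quotient adds 1 more when it carries into the exponent field during recombination. Starting from the 0x7D bias, the final exponent is 0x7D + 1 + 1 = 0x7F, which is correct for a result of 1.0.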
+ // + // Putting all of that together, what we _want_ to do is this: + // + // [#1] CMP r1, r0 // set C if num >= den + // [#2] MOVLO r1, r1, lsl #1 // if num < den, shift num left + // [#3] ADD r2, r2, #0x7D0000 // rebias exponent + // [#4] ADC r2, r2, r2, asr #16 // combine sign + exp + adjustment + // + // However, we only do the first of those four instructions right here. The + // other three are distributed through the code below, after unrelated load + // or multiply instructions which will have a result delay slot on simple + // CPUs. Each is labelled "exponent setup [#n]" in a comment. + // + // (Since instruction #4 depends on the flags set up by #2, we must avoid + // clobbering the flags in _any_ of the instructions interleaved with this!) + cmp r1, r0 // exponent setup [#1] + + // Start the mantissa division by making an approximation to the reciprocal + // of the denominator. We first obtain an 8-bit approximation using a table + // lookup indexed by the top 7 denominator bits (counting the leading 1, so + // really there are only 6 bits in the table index). + // + // (r0 >> 17) is the table index, and its top bit is always set, so it ranges + // from 64 to 127 inclusive. So we point the base register 64 bytes before + // the actual table. + adr r12, LOCAL_LABEL(tab) - 64 +#if __thumb__ + // Thumb can't do this particular shift+add+load in one instruction - it only + // supports left shifts of 0 to 3 bits, not right shifts of 17. So we must + // calculate the load offset separately. + add r14, r12, r0, lsr #17 + ldrb r14, [r14] +#else + ldrb r14, [r12, r0, lsr #17] +#endif + + // Now do an iteration of Newton-Raphson to improve that 8-bit approximation + // to have 15-16 accurate bits. + // + // Basics of Newton-Raphson for finding a reciprocal: if you want to find 1/d + // and you have some approximation x, your next approximation is X = x(2-dx). + // Looked at one way, this is the result of applying the N-R formula + // X=x-f(x)/f'(x) to the function f(x) = 1/x - d. Another way to look at it + // is to suppose that dx = 1 - e, for some e which is small (because dx is + // already reasonably close to 1). Then you want to double the number of + // correct bits in the next approximation, i.e. square the error. So you want + // dX = 1-e^2 = (1-e)(1+e) = dx(2-dx). Cancelling d gives X = x(2-dx) again. + // + // In this situation, we're working in fixed-point integers rather than real + // numbers, and all the scales are different: + // * our input denominator d is in the range [2^23,2^24) + // * our input approximation x is in the range [2^7,2^8) + // * we want the output approximation to be in the range [2^15,2^16) + // Those factors combine to mean that we want + // x(2^32-dx) / 2^23 + // = (2^9 x) - (dx^2 / 2^23) + // + // But we also want to compute this using ordinary MUL, not a long multiply + // instruction (those are slower). So we need to worry about the product + // overflowing. dx fits in 32 bits, because it's the product of something + // <2^24 with something <2^8; but we must shift it right before multiplying + // by x again. 
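As an illustration, the fixed-point refinement just described can be modelled in C (a rough sketch with invented names, not part of this file; it mirrors the MUL/MVN/MUL/ADD sequence below and assumes the usual two's-complement wrapping conversion and arithmetic right shift for negative values):

  #include <stdint.h>

  // d: 24-bit denominator mantissa in [2^23, 2^24)
  // x: 8-bit table estimate of 2^31/d, top bit set
  // returns a ~16-bit estimate of 2^39/d
  static uint32_t recip16(uint32_t d, uint32_t x) {
    uint32_t dx = d * x;                    // mul r12, r0, r14
    uint32_t t  = ~(dx >> 8);               // mvn r12, r12, lsr #8   (~= -dx/2^8)
    int32_t  e  = (int32_t)(t * x);         // mul r3, r12, r14       (~= -d*x^2/2^8)
    return (x << 9) + (uint32_t)(e >> 15);  // 2^9*x - d*x^2/2^23
  }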
+ + mul r12, r0, r14 // r12 = dx + movlo r1, r1, lsl #1 // exponent setup [#2] in the MUL delay slot + mvn r12, r12, lsr #8 // r12 ~= -dx/2^8 + mul r3, r12, r14 // r3 ~= -dx^2/2^8 + mov r14, r14, lsl #9 // r14 = 2^9 x + add r14, r14, r3, asr #15 // r14 ~= 2^9 x - dx^2 / 2^23 + + // Now r14 is a 16-bit approximation to the reciprocal of the input mantissa, + // scaled by 2^39 (so that the min mantissa 2^23 would have reciprocal 2^16 + // in principle, and the max mantissa 2^24-1 would have reciprocal just over + // 2^15). The error is always negative (r14 is an underestimate of the true + // value), and the maximum error is 6 and a bit ULP (that is, the true + // reciprocal is strictly less than (r14+7)). Also, r14 is always strictly + // less than 0x10000 (even in the case of the min mantissa, where the true + // value would be _exactly_ 0x10000), which eliminates a case of integer + // overflow. + // + // All of these properties of the reciprocal approximation are checked by + // exhaustively iterating over all 2^23 possible input mantissas. (The nice + // thing about doing this in single rather than double precision!) + // + // Now we extract most of the quotient by two steps of long division, using + // the reciprocal estimate to identify a multiple of the denominator to + // subtract from the numerator. To avoid integer overflow, the numerator + // mantissa is shifted down 8 bits so that it's less than 0x10000. After we + // calculate an approximate quotient, we shift the numerator left and + // subtract that multiple of the denominator, moving the next portion of the + // numerator into range for the next iteration. + + // First iteration of long division. We shift the numerator left 11 bits, and + // since the quotient approximation is scaled by 2^31, we must shift that + // right by 20 to make the right product to subtract from the numerator. + mov r12, r1, lsr #8 // shift the numerator down + mul r12, r14, r12 // make the quotient approximation + mov r1, r1, lsl #11 // shift numerator left, ready for subtraction + mov r3, r12, lsr #20 // make first 12-bit block of quotient bits + mls r1, r0, r3, r1 // subtract that multiple of den from num + + add r2, r2, #0x7D0000 // exponent setup [#3] in the MLS delay slot + + // Second iteration of long division. Differences from the first step: this + // time we shift the numerator 12 bits instead of 11, so that the total of + // both steps is 23 bits, i.e. we've shifted up by exactly the full width of + // the output mantissa. Also, the block of output quotient bits is left in a + // different register: it was in r3 the first time, and this time it's in + // r12, so that we still have both available at the end of the process. + mov r12, r1, lsr #8 // shift the numerator down + mul r12, r14, r12 // make the quotient approximation + mov r1, r1, lsl #12 // shift numerator left, ready for subtraction + mov r12, r12, lsr #19 // make second 11-bit block of quotient + mls r1, r0, r12, r1 // subtract that multiple of den from num + + adc r2, r2, r2, asr #16 // exponent setup [#4] in the MLS delay slot + + // Now r1 contains the original numerator, shifted left 23, minus _some_ + // multiple of the original denominator (which is still in r0). The bounds on + // the error in the above steps should make the error at most 1: that is, we + // may have to subtract the denominator one more time to make r1 < r0, and + // increment the quotient by one more. 
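The whole mantissa division described here, including the final correction that the next few instructions perform, can be sketched in C as follows (names invented, illustration only; each operation is the same modulo-2^32 arithmetic the assembly uses):

  #include <stdint.h>

  // n, d: 24-bit mantissas with d <= n < 2*d (n already shifted left if needed)
  // r: the ~16-bit reciprocal estimate of 2^39/d computed above
  // Returns floor((n << 23) / d) and leaves the remainder in *rem.
  static uint32_t divide_mantissas(uint32_t n, uint32_t d, uint32_t r,
                                   uint32_t *rem) {
    uint32_t q1 = (r * (n >> 8)) >> 20; // first 12-bit block of quotient bits
    n = (n << 11) - d * q1;             // subtract that multiple of d
    uint32_t q2 = (r * (n >> 8)) >> 19; // second 11-bit block
    n = (n << 12) - d * q2;
    uint32_t q = (q1 << 12) + q2;       // recombine the two blocks
    if (n >= d) {                       // at most one extra subtraction needed
      n -= d;
      q += 1;
    }
    *rem = n;
    return q;
  }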
+ // + // Our quotient is still in two pieces, computed separately in the above long + // division steps. We fold the final increment into the same instruction that + // recombines them, by doing the comparison in such a way that it sets the + // carry flag if the increment is needed. + + cmp r1, r0 // Set carry flag if num >= den + subhs r1, r1, r0 // If so, subtract den from num + adc r3, r12, r3, lsl #12 // Recombine quotient halves, plus optional +1 + + // We've finished with r14 as a temporary register, so we can unstack it now. + pop {r14} + + // Now r3 contains the _rounded-down_ output quotient, and r1 contains the + // remainder. That is, (denominator * r3 + r1) = (numerator << 23), and + // 0 <= r1 < denominator. + // + // Next we must round to nearest, by checking if r1 is greater than half the + // denominator. In division, it's not possible to hit an exact round-to-even + // halfway case, so we don't need to spend any time checking for it. + // + // Proof of no round-to-even: define the 'width' of a dyadic rational to be + // the distance between the lowest and highest 1 bits in its binary + // representation, or equivalently, the index of its high bit if you scale it + // by a power of 2 to make it an odd integer. E.g. any actual power of 2 has + // width 0, and all of 0b11110, 0b1111, 0b11.11 and 0b0.01111 have width 3. + // Then for any dyadic rationals a,b, width(ab) >= width(a)+width(b). Let w + // be the maximum width that the input precision supports (so that for single + // precision, w=23). Then if some division n/d were a round-to-even case, the + // true quotient q=n/d would have width exactly w+1. But we have qd=n, so + // width(n) >= width(q)+width(d) > w, which can't happen, because n is in the + // input precision, hence had width <= w.) + // + // So we don't need to check for an exact _halfway_ case and clear the low + // bit of the quotient after rounding up, as addition and multiplication both + // need to do. But we do need to remember if the quotient itself was exact, + // that is, if there was no remainder at all. That's needed in underflow + // handling. + + // The rounding check wants to compare remainder with denominator/2. But of + // course in integers it's easier to compare 2*remainder with denominator. So + // we start by shifting the remainder left by 1, and in the process, set Z if + // it's exactly 0 (i.e. the result needs no rounding at all). + lsls r1, r1, #1 + // Now trial-subtract the denominator. We don't do this at all if the result + // was exact. If we do do it, r1 goes negative precisely if we need to round + // up, which sets the C flag. (The previous instruction will have left C + // clear, since r1 had its top 8 bits all clear. So now C is set _only_ if + // we're rounding up.) + subsne r1, r1, r0 + // Recombine the quotient with the sign + exponent, and use the C flag from + // the previous instruction to increment the quotient if we're rounding up. + adc r0, r3, r2, lsl #23 + + // If we haven't either overflowed or underflowed, we're done. We can + // identify most of the safe cases by doing an unsigned comparison of the + // initial output exponent (in the top half of r2) with 0xFC: if 0 <= r2 < + // 0xFC0000 then we have neither underflow nor overflow. + // + // Rationale: the value in the top half of r2 had three chances to be + // incremented before becoming the exponent field of the actual output float. 
+ // It was incremented if we found the numerator mantissa was >= the + // denominator (producing the value in the _bottom_ half of r2, which we just + // ADCed into the output). Then it gets unconditionally incremented again + // when the ADC combines it with the leading mantissa bit. And finally, + // round-up might increment it a third time. So 0xFC is the smallest value + // that can possibly turn into the overflowed value 0xFF after all those + // increments. + // + // On the underflow side, (top half of r2) = 0 corresponds to a value of 1 in + // the final result's exponent field (and then rounding might increase it + // further); if the exponent was less than that then r2 wraps round and looks + // like a very large positive integer from the point of view of this unsigned + // comparison. + cmp r2, #0xFC0000 + bxlo lr + + // The same comparison will have set the N and V flags to reflect the result + // of comparing r2 with 0xFC0000 as a _signed_ integer. That reliably + // distinguishes potential underflow (r2 is negative) from potential overflow + // (r2 is positive and at least 0xFC0000) + bge LOCAL_LABEL(overflow) + + // Here we might or might not have underflow (but we know we don't have + // overflow). To check more carefully, we look at the _bottom_ half of r2, + // which contains the exponent after the first adjustment (for num >= denom), + // That is, it's still off by 1 (compensating for the leading quotient bit), + // and is also before rounding. + // + // We neglect the effect of rounding: division results that are tiny (less + // than the smallest normalised number) before rounding, but then round up to + // the smallest normal number, are an acceptable edge case to handle slowly. + // We pass those to funder without worrying about them. + // + // So we want to check whether the bottom half of r2 was negative. It would + // be nice to check bits 8-15 of it, but unfortunately, it's already been + // combined with the sign (at bit 8), so those bits don't tell us anything + // useful. Instead we look at the top 4 bits of the exponent field, i.e. the + // 0xF0 bits. The largest _non_-overflowing exponent that might reach here is + // less than 3, so it doesn't reach those bits; the smallest possible + // underflow, obtained by dividing the smallest denormal by the largest + // finite number, is -151 (before the leading bit increments it), which will + // set the low 8 bits of r2 to 0x69. That is, the 0xF0 nibble of r2 will be + // 0x60 or greater for a (pre-rounding) underflow, and zero for a + // non-underflow. + + tst r2, #0xF0 + bxeq lr // no underflow after all; return + + // Rebias the exponent for funder, which also corrects the sign bit. + add r0, r0, #192 << 23 + // Tell funder whether the true value is greater or less than the number in + // r0. This is obtained from the sign of the remainder (still in r1), with + // the only problem being that it's currently reversed. So negate r1 (leaving + // 0 at 0 to indicate exactness). + rsbs r1, r1, #0 + b SYMBOL_NAME(__compiler_rt_funder) + +LOCAL_LABEL(overflow): + // Here we might or might not have overflow (but we know we don't have + // underflow). We must check whether we really have overflowed. + // + // For this it's easiest to check the exponent field in the actual output + // value in r0, after _all_ the adjustments have been completed. 
The largest + // overflowed exponent is 0x193, and the smallest exponent that can reach + // this is 0xFD (we checked against 0xFC above, but then the leading quotient + // bit incremented it). So it's enough to shift the output left by one + // (moving the exponent field to the top), increment it once more (so that + // the smallest overflowed exponent 0xFF wraps round to 0), and then compare + // against 0xFE000000 as an unsigned integer. + mov r12, r0, lsl #1 + add r12, r12, #1 << 24 + cmp r12, #0xFE << 24 // Check for exp = 253 or 254 + bxhs lr + // We have actual overflow. Rebias r0 to bring the exponent back into range, + // which ensures its sign is correct. Then make an infinity of that sign to + // return. + subs r0, r0, #0xC0 << 23 + movs r12, #0xFF // exponent of infinity + orrs r12, r12, r0, lsr #23 // exponent and sign at bottom of r12 + movs r0, r12, lsl #23 // shift it up to the top of r0 to return + bx lr + +LOCAL_LABEL(uncommon): + // We come here from the start of the function if either input is an uncommon + // value: zero, denormal, infinity or NaN. + // + // We arrive here with r12 = 0xFF000000, and r2 containing the exponent of x + // in bits 16..23. But r3 doesn't necessarily contain the exponent of y, + // because the instruction that set it up was conditional. So first we + // unconditionally repeat it. + and r3, r12, r1, lsr #7 + + // In all cases not involving a NaN as output, the sign of the output is made + // in the same way as for finite numbers, as the XOR of the input signs. So + // repeat the sign setup from the main branch. + teq r0, r1 // is the output sign bit 1? + orrmi r2, r2, #0x100 // if so, set bit 8 of r2 + + // Detect infinities and NaNs, by checking if either of r2 or r3 is at least + // 0xFF0000. + cmp r2, #0xFF0000 + cmplo r3, #0xFF0000 + bhs LOCAL_LABEL(inf_NaN) + + // Now we know there are no infinities or NaNs, but there's at least one zero + // or denormal. + movs r12, r1, lsl #1 // is y zero? + beq LOCAL_LABEL(divbyzero) // if so, go and handle division by zero + movs r12, r0, lsl #1 // is x zero? (now we know that y is not) + moveq r0, r2, lsl #23 // if so, 0/nonzero is just 0 (of right sign) + bxeq lr + + // Now we've eliminated zeroes as well, leaving only denormals: either x or + // y, or both, is a denormal. Call fnorm2 to convert both into a normalised + // mantissa and a (potentially small) exponent. + and r12, r2, #0x100 // save the result sign from r2 + lsr r2, #16 // shift extracted exponents down to bit 0 + lsr r3, #16 // where fnorm2 will expect them + push {r0, r1, r2, r3, r12, lr} + mov r0, sp // tell fnorm2 where to find its data + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0, r1, r2, r3, r12, lr} + lsl r3, #16 // shift exponents back up to bit 16 + orr r2, r12, r2, lsl #16 // and put the result sign back in r2 + + // Now rejoin the main code path, having finished the setup it will expect: + // swap x and y, and shift the fractions back down to the low 24 bits. + mov r12, r0, lsr #8 + mov r0, r1, lsr #8 + mov r1, r12 + b LOCAL_LABEL(div) + +LOCAL_LABEL(inf_NaN): + // We come here if at least one input is a NaN or infinity. If either or both + // inputs are NaN then we hand off to fnan2 to propagate a NaN from the + // input. + mov r12, #0xFF000000 + cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + blo SYMBOL_NAME(__compiler_rt_fnan2) + cmp r12, r1, lsl #1 + blo SYMBOL_NAME(__compiler_rt_fnan2) + + // No NaNs, so we have three options: inf/inf = NaN, inf/finite = inf, and + // finite/inf = 0. 
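+  //
+  // As an illustrative C-style sketch (is_inf, default_nan, inf and zero are
+  // just our names for the checks and encodings performed below, not code
+  // that exists elsewhere), the remaining dispatch amounts to:
+  //
+  //   if (is_inf(x) && is_inf(y)) return default_nan();     // inf/inf
+  //   else if (is_inf(x))         return inf(result_sign);  // inf/finite
+  //   else /* is_inf(y) */        return zero(result_sign); // finite/inf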
+ + // If both operands are infinity, we return a NaN. Since we know at + // least _one_ is infinity, we can test this by checking if they're + // equal apart from the sign bits. + eor r3, r0, r1 + lsls r3, #1 // were all bits of XOR zero other than top? + beq LOCAL_LABEL(invalid) // if so, both operands are infinity + + // See if x is infinite + cmp r12, r0, lsl #1 // (r0 << 1) == 0xFF000000? + beq LOCAL_LABEL(infret) // if so, infinity/finite = infinity + + // y is infinite and x is not, so we return a zero of the + // combined sign. + eor r0, r0, r1 // calculate the right sign + and r0, r0, #0x80000000 // throw away everything else + bx lr + +LOCAL_LABEL(divbyzero): + // Here, we know y is zero. But we don't know if x is zero or nonzero. So we + // might be calculating 0/0 (invalid operation, generating a NaN), or + // nonzero/0 (the IEEE "division by zero" exception, generating infinity). + movs r12, r0, lsl #1 // is x zero too? + beq LOCAL_LABEL(invalid) // if so, go and return a NaN + +LOCAL_LABEL(infret): + // Here, we're either dividing infinity by a finite number, or dividing a + // nonzero number by 0. (Or both, if we're dividing infinity by 0.) In all + // these cases we return infinity with the sign from r2. + // + // If we were implementing IEEE exceptions, we'd have to separate these + // cases: infinity / finite is not an _exception_, it just returns infinity, + // whereas (finite and nonzero) / 0 is a division-by-zero exception. But here + // we're not implementing exceptions, so we can treat all three cases the + // same. + // + // r2 contains the output sign in bit 8, which is a convenient place to find + // it when making an infinity, because we can fill in the 8 exponent bits + // below that and then shift it left. + orr r2, r2, #0xff // sign + maximum exponent + lsl r0, r2, #23 // shift up to the top + bx lr + +LOCAL_LABEL(invalid): + // Return the default NaN, from an invalid operation (either dividing + // infinity by infinity, or 0 by 0). + ldr r0, =0x7FC00000 + bx lr + +// Finally, the lookup table for the initial reciprocal approximation. +// +// The table index is made from the top 7 bits of the denominator mantissa. But +// the topmost bit is always 1, so only the other 6 bits vary. So it only has +// 64 entries, not 128. +// +// Each table entry is a single byte, with its top bit set. So the table +// entries correspond to the reciprocal of a 7-bit mantissa prefix scaled up by +// 2^14, or the reciprocal of a whole 24-bit mantissa scaled up by 2^31. +// +// Each of these 64 entries corresponds to a large interval of possible +// mantissas. For example, if the top 7 bits are 1000001 then the overall +// mantissa could be anything from 0x820000 to 0x83FFFF. And because the output +// of this table provides more bits than the input, there are several choices +// of 8-bit reciprocal approximation for a number in that interval. The +// reciprocal of 0x820000 starts with 0xFC plus a fraction, and the reciprocal +// of 0x83FFFF starts with 0xF9 minus a fraction, so there are four reasonable +// choices for that table entry: F9, FA, FB or FC. Which do we pick? +// +// The table below is generated by choosing whichever value minimises the +// maximum possible error _after_ the approximation is improved by the +// Newton-Raphson step. In the example above, we end up with FA. +// +// The Python code below will regenerate the table, complete with the per-entry +// comments. 
+ +/* + +for prefix in range(64, 128): + best = None + + # Max and min 23-bit mantissas with this 7-bit prefix + mmin, mmax = prefix * 2**17, (prefix + 1) * 2**17 - 1 + + # Max and min table entry corresponding to the reciprocal of something in + # that range of mantissas: round up the reciprocal of mmax, and round down + # the reciprocal of mmin. Also clamp to the range [0x80,0xff], because + # 0x100 can't be used as a table entry due to not fitting in a byte, even + # though it's the exact reciprocal of the overall-smallest mantissa + # 0x800000. + gmin = max(128, (2**31 + mmin - 1) // mmax) + gmax = min(255, 2**31 // mmin) + + # For each of those table entries, compute the result of starting from that + # value and doing a Newton-Raphson iteration, with the mantissa at each end + # of the mantissa interval. One of these will be the worst possible error. + # Choose the table entry whose worst error is as small as possible. + # + # (To find the extreme values of a more general function on an interval, + # you must consider its values not only at the interval endpoints but also + # any turning points within the interval. Here, the function has only one + # turning point, and by construction it takes value 0 there, so we needn't + # worry.) + g = max( + range(gmin, gmax + 1), + key=lambda g: min( + (g * (2**32 - d * g) / 2**23 - 2**39 / d) for d in [mmin, mmax] + ), + ) + + print(f" .byte 0x{g:02x} // input [0x{mmin:06x},0x{mmax:06x}]" + f", candidate outputs [0x{gmin:02x},0x{gmax:02x}]" + ) + +*/ + + .p2align 2 // make sure we start on a 4-byte boundary, even in Thumb +LOCAL_LABEL(tab): + .byte 0xfe // input [0x800000,0x81ffff], candidate outputs [0xfd,0xff] + .byte 0xfa // input [0x820000,0x83ffff], candidate outputs [0xf9,0xfc] + .byte 0xf6 // input [0x840000,0x85ffff], candidate outputs [0xf5,0xf8] + .byte 0xf3 // input [0x860000,0x87ffff], candidate outputs [0xf1,0xf4] + .byte 0xef // input [0x880000,0x89ffff], candidate outputs [0xee,0xf0] + .byte 0xec // input [0x8a0000,0x8bffff], candidate outputs [0xeb,0xed] + .byte 0xe8 // input [0x8c0000,0x8dffff], candidate outputs [0xe7,0xea] + .byte 0xe5 // input [0x8e0000,0x8fffff], candidate outputs [0xe4,0xe6] + .byte 0xe2 // input [0x900000,0x91ffff], candidate outputs [0xe1,0xe3] + .byte 0xdf // input [0x920000,0x93ffff], candidate outputs [0xde,0xe0] + .byte 0xdc // input [0x940000,0x95ffff], candidate outputs [0xdb,0xdd] + .byte 0xd9 // input [0x960000,0x97ffff], candidate outputs [0xd8,0xda] + .byte 0xd6 // input [0x980000,0x99ffff], candidate outputs [0xd5,0xd7] + .byte 0xd3 // input [0x9a0000,0x9bffff], candidate outputs [0xd3,0xd4] + .byte 0xd1 // input [0x9c0000,0x9dffff], candidate outputs [0xd0,0xd2] + .byte 0xce // input [0x9e0000,0x9fffff], candidate outputs [0xcd,0xcf] + .byte 0xcc // input [0xa00000,0xa1ffff], candidate outputs [0xcb,0xcc] + .byte 0xc9 // input [0xa20000,0xa3ffff], candidate outputs [0xc8,0xca] + .byte 0xc7 // input [0xa40000,0xa5ffff], candidate outputs [0xc6,0xc7] + .byte 0xc4 // input [0xa60000,0xa7ffff], candidate outputs [0xc4,0xc5] + .byte 0xc2 // input [0xa80000,0xa9ffff], candidate outputs [0xc1,0xc3] + .byte 0xc0 // input [0xaa0000,0xabffff], candidate outputs [0xbf,0xc0] + .byte 0xbd // input [0xac0000,0xadffff], candidate outputs [0xbd,0xbe] + .byte 0xbb // input [0xae0000,0xafffff], candidate outputs [0xbb,0xbc] + .byte 0xb9 // input [0xb00000,0xb1ffff], candidate outputs [0xb9,0xba] + .byte 0xb7 // input [0xb20000,0xb3ffff], candidate outputs [0xb7,0xb8] + .byte 0xb5 // input [0xb40000,0xb5ffff], 
candidate outputs [0xb5,0xb6] + .byte 0xb3 // input [0xb60000,0xb7ffff], candidate outputs [0xb3,0xb4] + .byte 0xb1 // input [0xb80000,0xb9ffff], candidate outputs [0xb1,0xb2] + .byte 0xaf // input [0xba0000,0xbbffff], candidate outputs [0xaf,0xb0] + .byte 0xad // input [0xbc0000,0xbdffff], candidate outputs [0xad,0xae] + .byte 0xac // input [0xbe0000,0xbfffff], candidate outputs [0xab,0xac] + .byte 0xaa // input [0xc00000,0xc1ffff], candidate outputs [0xa9,0xaa] + .byte 0xa8 // input [0xc20000,0xc3ffff], candidate outputs [0xa8,0xa8] + .byte 0xa6 // input [0xc40000,0xc5ffff], candidate outputs [0xa6,0xa7] + .byte 0xa5 // input [0xc60000,0xc7ffff], candidate outputs [0xa4,0xa5] + .byte 0xa3 // input [0xc80000,0xc9ffff], candidate outputs [0xa3,0xa3] + .byte 0xa1 // input [0xca0000,0xcbffff], candidate outputs [0xa1,0xa2] + .byte 0xa0 // input [0xcc0000,0xcdffff], candidate outputs [0xa0,0xa0] + .byte 0x9e // input [0xce0000,0xcfffff], candidate outputs [0x9e,0x9f] + .byte 0x9d // input [0xd00000,0xd1ffff], candidate outputs [0x9d,0x9d] + .byte 0x9b // input [0xd20000,0xd3ffff], candidate outputs [0x9b,0x9c] + .byte 0x9a // input [0xd40000,0xd5ffff], candidate outputs [0x9a,0x9a] + .byte 0x98 // input [0xd60000,0xd7ffff], candidate outputs [0x98,0x99] + .byte 0x97 // input [0xd80000,0xd9ffff], candidate outputs [0x97,0x97] + .byte 0x96 // input [0xda0000,0xdbffff], candidate outputs [0x95,0x96] + .byte 0x94 // input [0xdc0000,0xddffff], candidate outputs [0x94,0x94] + .byte 0x93 // input [0xde0000,0xdfffff], candidate outputs [0x93,0x93] + .byte 0x92 // input [0xe00000,0xe1ffff], candidate outputs [0x91,0x92] + .byte 0x90 // input [0xe20000,0xe3ffff], candidate outputs [0x90,0x90] + .byte 0x8f // input [0xe40000,0xe5ffff], candidate outputs [0x8f,0x8f] + .byte 0x8e // input [0xe60000,0xe7ffff], candidate outputs [0x8e,0x8e] + .byte 0x8d // input [0xe80000,0xe9ffff], candidate outputs [0x8d,0x8d] + .byte 0x8b // input [0xea0000,0xebffff], candidate outputs [0x8b,0x8c] + .byte 0x8a // input [0xec0000,0xedffff], candidate outputs [0x8a,0x8a] + .byte 0x89 // input [0xee0000,0xefffff], candidate outputs [0x89,0x89] + .byte 0x88 // input [0xf00000,0xf1ffff], candidate outputs [0x88,0x88] + .byte 0x87 // input [0xf20000,0xf3ffff], candidate outputs [0x87,0x87] + .byte 0x86 // input [0xf40000,0xf5ffff], candidate outputs [0x86,0x86] + .byte 0x85 // input [0xf60000,0xf7ffff], candidate outputs [0x85,0x85] + .byte 0x84 // input [0xf80000,0xf9ffff], candidate outputs [0x84,0x84] + .byte 0x83 // input [0xfa0000,0xfbffff], candidate outputs [0x83,0x83] + .byte 0x82 // input [0xfc0000,0xfdffff], candidate outputs [0x82,0x82] + .byte 0x81 // input [0xfe0000,0xffffff], candidate outputs [0x80,0x81] + +END_COMPILERRT_FUNCTION(__aeabi_fdiv) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/fnan2.c b/compiler-rt/lib/builtins/arm/fnan2.c new file mode 100644 index 0000000000000..06bbd4339f171 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fnan2.c @@ -0,0 +1,42 @@ +//===-- fnan2.c - Handle single-precision NaN inputs to binary operation --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations to handle propagating NaNs from the input +// operands to the output, in a way that matches Arm hardware FP. +// +// On input, a and b are floating-point numbers in IEEE 754 encoding, and at +// least one of them must be a NaN. The return value is the correct output NaN. +// +// A signalling NaN in the input (with bit 22 clear) takes priority over any +// quiet NaN, and is adjusted on return by setting bit 22 to make it quiet. If +// both inputs are the same type of NaN then the first input takes priority: +// the input a is used instead of b. +// +//===----------------------------------------------------------------------===// + +#include + +uint32_t __compiler_rt_fnan2(uint32_t a, uint32_t b) { + // Make shifted-left copies of a and b to discard the sign bit. Then add 1 at + // the bit position where the quiet vs signalling bit ended up. This squashes + // all the signalling NaNs to the top of the range of 32-bit values, from + // 0xff800001 to 0xffffffff inclusive; meanwhile, all the quiet NaN values + // wrap round to the bottom, from 0 to 0x007fffff inclusive. So we can detect + // a signalling NaN by asking if it's greater than 0xff800000, and a quiet + // one by asking if it's less than 0x00800000. + uint32_t aadj = (a << 1) + 0x00800000; + uint32_t badj = (b << 1) + 0x00800000; + if (aadj > 0xff800000) // a is a signalling NaN? + return a | 0x00400000; // if so, return it with the quiet bit set + if (badj > 0xff800000) // b is a signalling NaN? + return b | 0x00400000; // if so, return it with the quiet bit set + if (aadj < 0x00800000) // a is a quiet NaN? + return a; // if so, return it + return b; // otherwise we expect b must be a quiet NaN +} diff --git a/compiler-rt/lib/builtins/arm/fnorm2.c b/compiler-rt/lib/builtins/arm/fnorm2.c new file mode 100644 index 0000000000000..29eba1cbde59d --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fnorm2.c @@ -0,0 +1,62 @@ +//===-- fnorm2.c - Handle single-precision denormal inputs to binary op ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations, to handle denormal inputs on entry by +// renormalizing the mantissa and modifying the exponent to match. +// +//===----------------------------------------------------------------------===// + +#include + +// Structure containing the function's inputs and outputs. +// +// On entry: a, b are two input floating-point numbers, still in IEEE 754 +// encoding. expa and expb are the 8-bit exponents of those numbers, extracted +// and shifted down to the low 8 bits of the word, with no other change. +// Neither value should be zero, or have the maximum exponent (indicating an +// infinity or NaN). +// +// On exit: each of a and b contains the mantissa of the input value, with the +// leading 1 bit made explicit, and shifted up to the top of the word. If expa +// was zero (indicating that a was denormal) then it is now represented as a +// normalized number with an out-of-range exponent (zero or negative). 
The same +// applies to expb and b. +struct fnorm2 { + uint32_t a, b, expa, expb; +}; + +void __compiler_rt_fnorm2(struct fnorm2 *values) { + // Shift the mantissas of a and b to the right place to follow a leading 1 in + // the top bit, if there is one. + values->a <<= 8; + values->b <<= 8; + + // Test if a is denormal. + if (values->expa == 0) { + // If so, decide how much further up to shift its mantissa, and adjust its + // exponent to match. This brings the leading 1 of the denormal mantissa to + // the top of values->a. + uint32_t shift = __builtin_clz(values->a); + values->a <<= shift; + values->expa = 1 - shift; + } else { + // Otherwise, leave the mantissa of a in its current position, and OR in + // the explicit leading 1. + values->a |= 0x80000000; + } + + // Do the same operation on b. + if (values->expb == 0) { + uint32_t shift = __builtin_clz(values->b); + values->b <<= shift; + values->expb = 1 - shift; + } else { + values->b |= 0x80000000; + } +} diff --git a/compiler-rt/lib/builtins/arm/funder.c b/compiler-rt/lib/builtins/arm/funder.c new file mode 100644 index 0000000000000..fd29e157328a3 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/funder.c @@ -0,0 +1,78 @@ +//===-- funder.c - Handle single-precision floating-point underflow -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations to handle underflowed output values, if they were +// computed in the form of a normalized mantissa and an out-of-range exponent. +// +// On input: x should be a complete IEEE 754 floating-point value representing +// the desired output scaled up by 2^192 (the same value that would have been +// passed to an underflow trap handler in IEEE 754:1985). +// +// This isn't enough information to re-round to the correct output denormal +// without also knowing whether x itself has already been rounded, and which +// way. 'errsign' gives this information, by indicating the sign of the value +// (true result - x). That is, if errsign > 0 it means the true value was +// larger (x was rounded down); if errsign < 0 then x was rounded up; if +// errsign == 0 then x represents the _exact_ desired output value. +// +//===----------------------------------------------------------------------===// + +#include + +#define SIGNBIT 0x80000000 +#define MANTSIZE 23 +#define BIAS 0xc0 + +uint32_t __compiler_rt_funder(uint32_t x, uint32_t errsign) { + uint32_t sign = x & SIGNBIT; + uint32_t exponent = (x << 1) >> 24; + + // Rule out exponents so small (or large!) that no denormalisation + // is needed. + if (exponent > BIAS) { + // Exponent 0xc1 or above means a normalised number got here by + // mistake, so we just remove the 0xc0 exponent bias and go + // straight home. + return x - (BIAS << MANTSIZE); + } + uint32_t bits_lost = BIAS + 1 - exponent; + if (bits_lost > MANTSIZE + 1) { + // The implicit leading 1 of the intermediate value's mantissa is + // below the lowest mantissa bit of a denormal by at least 2 bits. + // Round down to 0 unconditionally. + return sign; + } + + // Make the full mantissa (with leading bit) at the top of the word. + uint32_t mantissa = 0x80000000 | (x << 8); + // Adjust by 1 depending on the sign of the error. 
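+  // (errsign >> 31 is 1 exactly when errsign has its top bit set, i.e. x was
+  // rounded up, so we step the mantissa down; (-errsign) >> 31 is 1 exactly
+  // when errsign is nonzero with its top bit clear, i.e. x was rounded down,
+  // so we step it up. The one-bit nudge is far below the output precision,
+  // but it stops a value that was already rounded from being mistaken for an
+  // exact halfway case by the re-rounding below.)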
+ mantissa -= errsign >> 31; + mantissa += (-errsign) >> 31; + + // Shift down to the output position, keeping the bits shifted off. + uint32_t outmant, shifted_off; + if (bits_lost == MANTSIZE + 1) { + // Special case for the exponent where we have to shift the whole + // of 'mantissa' off the bottom of the word. + outmant = 0; + shifted_off = mantissa; + } else { + outmant = mantissa >> (8 + bits_lost); + shifted_off = mantissa << (32 - (8 + bits_lost)); + } + + // Re-round. + if (shifted_off >> 31) { + outmant++; + if (!(shifted_off << 1)) + outmant &= ~1; // halfway case: round to even + } + + return sign | outmant; +} diff --git a/compiler-rt/lib/builtins/arm/mulsf3.S b/compiler-rt/lib/builtins/arm/mulsf3.S new file mode 100644 index 0000000000000..346d3ed377c9c --- /dev/null +++ b/compiler-rt/lib/builtins/arm/mulsf3.S @@ -0,0 +1,319 @@ +//===-- mulsf3.S - single-precision floating point multiplication ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float multiplication with the +// IEEE-754 default rounding (to nearest, ties to even), in optimized AArch32 +// assembly language suitable to be built as either Arm or Thumb2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + + .syntax unified + .text + .p2align 2 + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__mulsf3) + push {r4, lr} + vmov r0, s0 + vmov r1, s1 + bl __aeabi_fmul + vmov s0, r0 + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__mulsf3, __aeabi_fmul) +#endif + +DEFINE_COMPILERRT_FUNCTION(__aeabi_fmul) + + // Check if either input exponent is 00 or FF (i.e. not a normalized number), + // and if so, branch out of line. If we don't branch out of line, then we've + // also extracted the exponents of the input values r0/r1 into bits 16..23 of + // r2/r3. But if we do, then that hasn't necessarily been done (because the + // second AND might have been skipped). + mov r12, #0xFF0000 + ands r2, r12, r0, lsr #7 // sets Z if exponent of x is 0 + andsne r3, r12, r1, lsr #7 // otherwise, sets Z if exponent of y is 0 + teqne r2, r12 // otherwise, sets Z if exponent of x is FF + teqne r3, r12 // otherwise, sets Z if exponent of y is FF + beq LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm + + // Calculate the sign of the result, and put it in an unused bit of r2. + teq r0, r1 // sets N to the XOR of x and y's sign bits + orrmi r2, r2, #0x100 // if N set, set bit 8 of r2 + + // Move the input mantissas to the high end of r0/r1, each with its leading + // bit set explicitly, so that they're in the right form to be multiplied. + mov r12, #0x80000000 + orr r0, r12, r0, lsl #8 + orr r1, r12, r1, lsl #8 + + // Now we're ready to multiply mantissas. This is also the place we'll come + // back to after decoding denormal inputs. The denormal decoding will also + // have to set up the same register contents: + // - decoded fractions at the top of r0 and r1 + // - exponents in r2 and r3, starting at bit 16 + // - output sign in r2 bit 8 +LOCAL_LABEL(mul): + + // Here we multiply the mantissas, and compute the output exponent by adding + // the input exponents and rebiasing. These operations are interleaved to + // use a delay slot. 
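+  //
+  // In C terms this step is roughly (an illustrative sketch; mant_x, mant_y,
+  // exp_x and exp_y are just our names for the register contents):
+  //
+  //   uint64_t prod = (uint64_t)mant_x * mant_y; // both have bit 31 set
+  //   int      exp  = exp_x + exp_y - 0x80;      // rebias, explained below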
+ // + // The exponent is rebiased by subtracting 0x80, rather than the 0x7F you'd + // expect. That compensates for the leading bit of the mantissa overlapping + // it, when we recombine the exponent and mantissa by addition. + add r2, r2, r3 // r2 has sum of exponents, freeing up r3 + umull r1, r3, r0, r1 // r3:r1 has the double-width product + sub r2, r2, #(0x80 << 16) // rebias the summed exponent + + // Compress the double-word product into just the high-order word r3, by + // setting its bit 0 if any bit of the low-order word is nonzero. This + // changes the represented value, but not by nearly enough to affect + // rounding, because rounding only depends on the bit below the last output + // bit, and the general question of whether _any_ nonzero bit exists below + // that. + cmp r1, #0 // if low word of full product is nonzero + orrne r3, r3, #1 // then set LSB of high word + + // The two inputs to UMULL had their high bits set, that is, were at least + // 0x80000000. So the 64-bit product was at least 0x4000000000000000, i.e. + // the high bit of the product could be at the top of the word or one bit + // below. Check which, by experimentally shifting left, and then undoing it + // via RRX if we turned out to have shifted off a 1 bit. + lsls r3, r3, #1 // shift left, setting C to the bit shifted off + rrxcs r3, r3 // if that bit was 1, put it back again + + // That ensured the leading 1 bit of the product is now the top of r3, but + // also, set C if the leading 1 was _already_ in the top bit. So now we know + // whether to increment the exponent. The following instruction does the + // conditional increment (because it's ADC), but also, copies the exponent + // field from bit 16 of r2 into bit 0, so as to place it just below the + // output sign bit. + // + // So, if the number hasn't overflowed or underflowed, the low 9 bits of r2 + // are exactly what we need to combine with the rounded mantissa. But the + // full output exponent (with extra bits) is still available in the high half + // of r2, so that we can check _whether_ we overflowed or underflowed. + adc r2, r2, r2, asr #16 + + // Recombine the exponent and mantissa, doing most of the rounding as a side + // effect: we shift the mantissa right so as to put the round bit into C, and + // then we recombine with the exponent using ADC, to increment the mantissa + // if C was set. + movs r12, r3, lsr #8 + adc r0, r12, r2, lsl #23 + + // To complete the rounding, we must check for the round-to-even tiebreaking + // case, by checking if we're in the exact halfway case, which occurs if and + // only if we _did_ round up (we can tell this because C is still set from + // the MOVS), and also, no bit of r3 is set _below_ the round bit. + // + // We combine this with an overflow check, so that C ends up set if anything + // weird happened, and clear if we're completely finished and can return. + // + // The best instruction sequence for this part varies between Arm and Thumb. +#if !__thumb__ + // Arm state: if C was set then we check the low bits of r3, so that Z ends + // up set if we need to round to even. + // + // (We rely here on Z reliably being clear to begin with, because shifting + // down the output mantissa definitely gave a nonzero output. Also, the TST + // doesn't change C, so if Z does end up set, then C was also set.) + // + // Then, if we're not rounding to even, we do a CMP which sets C if there's + // been an overflow or an underflow. 
An overflow could occur for an output + // exponent as low as 0xFC, because we might increment the exponent by 1 when + // renormalizing, by another when recombining with the mantissa, and by one + // more if rounding up causes a carry off the top of the mantissa. An + // underflow occurs only if the output exponent is negative (because it's + // offset by 1, so an exponent of 0 will be incremented to 1), in which case + // the top 8 bits of r2 will all be set. Therefore, an unsigned comparison to + // see if r2 > 0xFC0000 will catch all overflow and underflow cases. It also + // catches a few very large cases that _don't_ quite overflow (exponents of + // 0xFC and above that don't get maximally unlucky); those will also be + // handled by the slow path. + tstcs r3, #0x7F + cmpne r2, #0xFC0000 +#else + // In Thumb, switching between different conditions has a higher cost due to + // the (implicit in this code) IT instructions, so we prefer a strategy that + // uses CC and CS conditions throughout, at the cost of requiring some extra + // cleanup instructions on the slow path. + // + // If C is set (and hence round-to-even is a possibility), the basic idea is + // to shift the full result word (r3) left by 25, leaving only its bottom 7 + // bits, which are now the top 7 bits; then we want to set C iff these are 0. + // + // The "CMP x,y" instruction sets C if y > x (as unsigned integers). So this + // could be done in one instruction if only we had a register to use as x, + // which has 0 in the top 7 bits and at least one nonzero. Then we could + // compare that against the shifted-up value of r3, setting C precisely if + // the top 7 bits of y are greater than 0. And happily, we _do_ have such a + // register! r12 contains the shifted-down mantissa, which is guaranteed to + // have a 1 in bit 23, and 0 above that. + // + // The shift of r3 happens only in the second operand of the compare, so we + // don't lose the original value of r3 in this process. + // + // The check for over/underflow is exactly as in the Arm branch above, except + // based on a different condition. + cmpcs r12, r3, lsl #25 // now C is set iff we're rounding to even + cmpcc r2, #0xFC0000 // and now it's also set if we've over/underflowed +#endif + + // That's all the checks for difficult cases done. If C is clear, we can + // return. + bxcc lr + + // Now the slower path begins. We have to recover enough information to + // handle all of round-to-even, overflow and underflow. + // + // Round to even is the most likely of these, so we detect it first and + // handle it as fast as possible. + +#if __thumb__ + // First, Thumb-specific compensation code. The Arm branch of the #if above + // will have set Z=0 to indicate round to even, but the Thumb branch didn't + // leave any unambiguous indicator of RTE, so we must retest by checking all + // the bits shifted off the bottom of the mantissa to see if they're exactly + // the half-way value. + lsl r12, r3, #24 // r12 = round bit and everything below + cmp r12, #0x80000000 // set Z if that is exactly 0x80000000 +#endif + + // Now Z is clear iff we have already rounded up and now must replace that + // with rounding to even, which is done by just clearing the low bit of the + // mantissa. + biceq r0, r0, #1 + + // Redo the over/underflow check (the same way as in both branches above), + // and if it doesn't report a danger, we can return the rounded-to-even + // answer. + cmp r2, #0xFC0000 // check for over/underflow + bxcc lr // and return if none. 
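+
+  // To summarise the rounding implemented by the paths above, an
+  // illustrative C sketch (prod_hi/prod_lo are our names for the
+  // renormalised product halves, not registers in this file):
+  //
+  //   uint32_t mant   = prod_hi >> 8;        // 24 bits incl. the leading 1
+  //   uint32_t round  = (prod_hi >> 7) & 1;  // first discarded bit
+  //   uint32_t sticky = (prod_hi & 0x7f) | (prod_lo != 0);
+  //   mant += round;                         // round to nearest
+  //   if (round && !sticky) mant &= ~1u;     // exact halfway: ties to even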
+ + // Now we only have overflow and underflow left to handle. First, find out + // which we're looking at. This is easy by testing the top bit of r2, but + // even easier by using the fact that the possible positive and negative + // values of r2 are widely enough separated that the 0xFC0000 subtracted by + // the CMP above won't have made any difference. So the N flag output from + // that comparison _already_ tells us which condition we have: if N is set we + // have underflow, and if N is clear, overflow. + bpl LOCAL_LABEL(overflow) + + // Here we're handling underflow. + + // Add the IEEE 754:1985 exponent bias which funder will expect. This also + // brings the exponent back into a range where it can't possibly have carried + // into the sign bit, so the output sign will now be right. + add r0, r0, #(0xC0 << 23) + + // Determine whether we rounded up, down or not at all. + lsls r2, r3, #1 // input mantissa, without its leading 1 + subs r1, r2, r0, lsl #9 // subtract the output mantissa (likewise) + + // And let funder handle the rest. + b SYMBOL_NAME(__compiler_rt_funder) + +LOCAL_LABEL(overflow): + // We come here to handle overflow, but it's not guaranteed that an overflow + // has actually happened: our check on the fast path erred on the side of + // caution, by catching any output exponent that _could_ cause an overflow. + // So first check whether this really is an overflow, by extracting the + // output exponent. Exponent 0xFF, or anything that wrapped round to having + // the high bit clear, are overflows; 0xFE down to 0xFC are not overflows. + // + // The value in r0 is correct to return, if there's no overflow. + add r12, r0, #(1 << 23) // add 1 to the exponent so 0xFF wraps to 0 + movs r12, r12, lsl #1 // test the top bit of the modified value + bxmi lr // if top bit is still 1, not an overflow + + // This is an overflow, so we need to replace it with an appropriately signed + // infinity. First we correct the sign by applying a downward bias to the + // exponent (the one suggested in IEEE 754:1985, which was chosen to bring + // all possible overflowed results back into range). + subs r0, r0, #(0xC0 << 23) + + // Now the sign bit of r0 is correct. Replace everything else with the + // encoding of an infinity. + mov r1, #0xFF + and r0, r0, #0x80000000 + orr r0, r0, r1, lsl #23 + bx lr + +LOCAL_LABEL(uncommon): + // Handle zeros, denorms, infinities and NaNs. We arrive here knowing that + // we've at least done the first _two_ instructions from the entry point, + // even if all the rest were skipped. So r2 contains the sign and exponent of + // x in bits 16..23, and r12 = 0xFF << 16. + // + // So, first repeat some instructions from the prologue, which were either + // conditionally skipped in the sequence leading to the branch, or skipped + // because they happened after the branch. + and r3, r12, r1, lsr #7 // get exponent of y in r3 bits 16..23 + teq r0, r1 // calculate the sign of the result + orrmi r2, r2, #0x100 // and put it in bit 8 of r2 as before + + // Check for infinities and NaNs, by testing each of r2,r3 to see if it's at + // least 0xFF0000 (hence the exponent field is equal to 0xFF). + cmp r2, r12 + cmplo r3, r12 + bhs LOCAL_LABEL(inf_NaN) + + // If we didn't take that branch, then we have only finite numbers, but at + // least one is denormal or zero. A zero makes the result easy (and also is a + // more likely input than a denormal), so check those first, as fast as + // possible. 
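+  //
+  // (In C terms: if (x << 1) == 0 or (y << 1) == 0, the product is a zero
+  // carrying the result sign already sitting in bit 8 of r2.)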
+ movs r12, r0, lsl #1 // Z set if x == 0 + movsne r12, r1, lsl #1 // now Z set if either input is 0 + moveq r0, r2, lsl #23 // in either case, make 0 of the output sign + bxeq lr // and return it + + // Now we know we only have denormals to deal with. Call fnorm2 to sort + // them out, and rejoin the main code path above. + and r12, r2, #0x100 // save the result sign from r2 + lsr r2, #16 // shift extracted exponents down to bit 0 + lsr r3, #16 // where fnorm2 will expect them + push {r0, r1, r2, r3, r12, lr} + mov r0, sp // tell fnorm2 where to find its data + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0, r1, r2, r3, r12, lr} + lsl r3, #16 // shift exponents back up to bit 16 + orr r2, r12, r2, lsl #16 // and put the result sign back in r2 + b LOCAL_LABEL(mul) + +LOCAL_LABEL(inf_NaN): + // We come here if at least one input is a NaN or infinity. If either or both + // inputs are NaN then we hand off to fnan2 which will propagate a NaN from + // the input; otherwise any multiplication involving infinity returns + // infinity, unless it's infinity * 0 which is an invalid operation and + // returns NaN again. + mov r12, #0xFF000000 + cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + blo SYMBOL_NAME(__compiler_rt_fnan2) + cmp r12, r1, lsl #1 + blo SYMBOL_NAME(__compiler_rt_fnan2) + + // NaNs are dealt with, so now we have at least one infinity. Check if the + // other operand is 0. This is conveniently done by XORing the two: because + // we know that the low 31 bits of one operand are exactly 0x7F800000, we can + // test if the low 31 bits of the other one are all 0 by checking whether the + // low 31 bits of (x XOR y) equal 0x7F800000. + eor r3, r0, r1 + cmp r12, r3, lsl #1 // if inf * 0, this sets Z + lsr r0, r12, #1 // set up return value of +infinity + orrne r0, r0, r2, lsl #23 // if not inf * 0, put on the output sign + orreq r0, r0, #0x400000 // otherwise, set the 'quiet NaN' bit + bx lr // and return + +END_COMPILERRT_FUNCTION(__aeabi_fmul) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S new file mode 100644 index 0000000000000..f2ede1013a9e6 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S @@ -0,0 +1,251 @@ +//===-- mulsf3.S - single-precision floating point multiplication ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float multiplication with the +// IEEE-754 default rounding (to nearest, ties to even), in optimized Thumb1 +// assembly language. +// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" + + .syntax unified + .text + .thumb + .p2align 2 + +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__mulsf3) + push {r4,r5,r6,lr} + + // Get exponents of the inputs, and check for uncommon values. In the process + // of this we also compute the sign, because it's marginally quicker that + // way. 
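+  //
+  // (How the sign falls out: each LSLS below moves an input's sign bit into
+  // the carry flag, and the ADCS that follows it folds that carry into bit 0
+  // of r4. Since r3 = y << 1 has bit 0 clear, bit 0 of r4 ends up equal to
+  // sign(x) XOR sign(y), which is the sign of the product.)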
+ lsls r2, r0, #1 + adcs r4, r4, r4 // set r4[0] to sign bit of x + lsls r3, r1, #1 + adcs r4, r4, r3 // set r4[0] to the output sign + lsrs r2, r2, #24 + beq LOCAL_LABEL(zerodenorm0) // still do the next LSRS + lsrs r3, r3, #24 + beq LOCAL_LABEL(zerodenorm) + cmp r2, #255 + beq LOCAL_LABEL(naninf) + cmp r3, #255 + beq LOCAL_LABEL(naninf) + // Compute the output exponent. We'll be generating our product _without_ the + // leading bit, so we subtract 0x7f rather than 0x80. + adds r2, r2, r3 + subs r2, r2, #0x7f + // Blank off everything above the mantissas. + lsls r0, r0, #9 + lsls r1, r1, #9 +LOCAL_LABEL(normalised): // we may come back here from zerodenorm + lsrs r0, r0, #9 + lsrs r1, r1, #9 + // Multiply. r0 and r1 are the mantissas of the inputs but without their + // leading bits, so the product we want in principle is P=(r0+2^23)(r1+2^23). + // P is at most (2^24-1)^2 < 2^48, so it fits in a word and a half. + // + // The technique below will actually compute P - 2^46, by not adding on the + // term where the two 2^23 are multiplied. The 48-bit result will be + // delivered in two output registers, one containing its bottom 32 bits and + // the other containing the top 32, so they overlap in the middle 16 bits. + // This is done using only two multiply instructions and some bookkeeping. + // + // In the comments I'll write X and Y for the original input mantissas (again + // without their leading bits). I'll also decompose them as X = xh + xl and + // Y = yh + yl, where xl and yl are in the range 0..2^8-1 and xh,yh are + // multiples of 2^8. + adds r5, r0, r1 + lsls r5, r5, #7 // r5 = (X+Y) << 7 + movs r6, r0 + muls r6, r1, r6 // r6 is congruent mod 2^32 to X*Y + lsrs r0, r0, #8 + lsrs r1, r1, #8 + muls r0, r1, r0 + lsls r1, r0, #16 // r1 is congruent mod 2^32 to xh*yh + subs r3, r6, r1 // now r3 is congruent mod 2^32 to + // (X*Y) - (xh*yh) = xh*yl + xl*yh + xl*yl + // and hence, since that is at most 0xfeff0001, + // is _exactly_ equal to that + adds r0, r0, r5 // r0 is now (xh*yh + (X+Y)<<23) >> 16 + lsrs r1, r3, #16 // r1 is the top 16 bits of r3, i.e. + // (xh*yl + xl*yh + xl*yl) >> 16 + adds r3, r0, r1 // now r3 equals + // (xh*yh + xh*yl + xl*yh + xl*yl + (X+Y)<<23) >> 16 + // i.e. (X*Y + (X+Y)<<23) >> 16, + // i.e. (the right answer) >> 16. + // Meanwhile, r6 is exactly the bottom 32 bits of the + // right answer. + // Renormalise if necessary. + lsrs r1, r3, #30 + beq LOCAL_LABEL(norenorm) + // Here we have to do something fiddly. Renormalisation would be a trivial + // job if we had the leading mantissa bit - just note that it's one bit + // position above where it should be, and shift right by one. But without + // that bit, we currently have (2x - 2^30), and we want (x - 2^30); just + // shifting right would of course give us (x - 2^29), so we must subtract an + // extra 2^29 to fix this up. + lsrs r3, r3, #1 + movs r1, #1 + lsls r1, r1, #29 + subs r3, r3, r1 + adds r2, r2, #1 +LOCAL_LABEL(norenorm): + // Round and shift down to the right bit position. + lsrs r0, r3, #7 // round bit goes into the carry flag + bcc LOCAL_LABEL(rounded) + adds r0, r0, #1 + // In the round-up branch, we must also check if we have to round to even, by + // testing all the bits below the round bit. We will normally not expect to, + // so we do RTE by branching out of line and back again to avoid spending a + // branch in the common case. 
+ lsls r5, r3, #32-7+1 // check the bits shifted out of r3 above + bne LOCAL_LABEL(rounded) // if any is nonzero, we're not rounding to even + lsls r5, r6, #15 // check the bottom 17 bits of the low-order 32 + // (enough to overlap r3 even if we renormalised) + beq LOCAL_LABEL(rte) // if any is nonzero, fall through, else RTE +LOCAL_LABEL(rounded): + // Put on the sign and exponent, check for underflow and overflow, and + // return. + // + // Underflow occurs iff r2 (the output exponent) <= 0. Overflow occurs if + // it's >= 0xFF. (Also if it's 0xFE and we rounded up to overflow, but since + // this code doesn't report exceptions, we can ignore this case because it'll + // happen to return the right answer regardless). So we handle most of this + // via an unsigned comparison against 0xFF, which leaves the one case of a + // zero exponent that we have to filter separately by testing the Z flag + // after we shift the exponent back up into place. + cmp r2, #0xFF // check for most over/underflows + bhs LOCAL_LABEL(outflow) // ... and branch out of line for them + lsls r5, r2, #23 // shift the exponent into its output location + beq LOCAL_LABEL(outflow) // ... and branch again if it was 0 + lsls r4, r4, #31 // shift the output sign into place + orrs r0, r0, r4 // and OR it in to the output + adds r0, r0, r5 // OR in the mantissa + pop {r4,r5,r6,pc} // and return + +LOCAL_LABEL(rte): + // Out-of-line handler for the round-to-even case. Clear the low mantissa bit + // and go back to the post-rounding code. + movs r5, #1 + bics r0, r0, r5 + b LOCAL_LABEL(rounded) + +LOCAL_LABEL(outflow): + cmp r2, #0 + bgt LOCAL_LABEL(overflow) + // To handle underflow, we construct an intermediate value in the IEEE 754 + // style (using our existing full-length mantissa, and bias the exponent by + // +0xC0), and indicate whether that intermediate was rounded up, down or not + // at all. Then call the helper function funder, which will denormalise and + // re-round correctly. + lsls r1, r0, #7 // shift up the post-rounding mantissa + subs r1, r3, r1 // and subtract it from the pre-rounding version + lsls r6, r6, #15 + cmp r6, #1 // if the rest of the low bits are nonzero + adcs r1, r1, r1 // then set an extra bit at the bottom + + lsls r4, r4, #31 + orrs r0, r0, r4 // put on the sign + adds r2, r2, #192 // bias the exponent + lsls r3, r2, #23 + adds r0, r0, r3 // put on the biased exponent + + bl SYMBOL_NAME(__compiler_rt_funder) + pop {r4,r5,r6,pc} + +LOCAL_LABEL(overflow): + // Handle overflow by returning an infinity of the correct sign. + lsls r4, r4, #8 // move the sign up to bit 8 + movs r0, #0xff + orrs r0, r0, r4 // fill in an exponent just below it + lsls r0, r0, #23 // and shift those 9 bits up to the top of the word + pop {r4,r5,r6,pc} + + // We come here if there's at least one zero or denormal. On the fast path + // above, it was convenient to check these before checking NaNs and + // infinities, but NaNs take precedence, so now we're off the fast path, we + // must still check for those. + // + // At the main entry point 'zerodenorm' we want r2 and r3 to be the two input + // exponents. So if we branched after shifting-and-checking r2, we come to + // this earlier entry point 'zerodenorm0' so that we still shift r3. +LOCAL_LABEL(zerodenorm0): + lsrs r3, r3, #24 +LOCAL_LABEL(zerodenorm): + cmp r2, #255 + beq LOCAL_LABEL(naninf) + cmp r3, #255 + beq LOCAL_LABEL(naninf) + // Now we know we have at least one zero or denormal, and no NaN or infinity. + // Check if either input is actually zero. 
We've ruled out 0 * infinity by + // this point, so any zero input means we return zero of the correct sign. + lsls r6, r0, #1 // is one input zero? + beq LOCAL_LABEL(zero) // yes, go and return zero + lsls r6, r1, #1 // is the other one zero? + bne LOCAL_LABEL(denorm) // if not, one must have been a denormal +LOCAL_LABEL(zero): + lsls r0, r4, #31 // shift up the output sign to make the return value + pop {r4,r5,r6,pc} + + // Handle denormals via the helper function fnorm2, which will break both + // inputs up into mantissa and exponent, renormalising and generating a + // negative exponent if necessary. +LOCAL_LABEL(denorm): + push {r0,r1,r2,r3} + mov r0, sp + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0,r1,r2,r3} + // Convert fnorm2's return values into the right form to rejoin the main + // code path. + lsls r0, r0, #1 + lsls r1, r1, #1 + adds r2, r2, r3 + subs r2, r2, #0x7f + b LOCAL_LABEL(normalised) + + // We come here if at least one input is a NaN or infinity. There may still + // be zeroes (or denormals, though they make no difference at this stage). +LOCAL_LABEL(naninf): + movs r6, #0xff + lsls r6, r6, #24 + lsls r5, r0, #1 + cmp r5, r6 + bhi LOCAL_LABEL(nan) // first operand is a NaN + lsls r5, r1, #1 + cmp r5, r6 + bhi LOCAL_LABEL(nan) // second operand is a NaN + + // We know we have at least one infinity, and no NaNs. We might also have a + // zero, in which case we return the default quiet NaN. + lsls r6, r0, #1 + beq LOCAL_LABEL(infzero) // if r0 is a zero, r1 must be inf + lsls r6, r1, #1 + beq LOCAL_LABEL(infzero) // if r1 is a zero, r0 must be inf + // Otherwise we have infinity * infinity, or infinity * finite. Just return + // an appropriately signed infinity. + b LOCAL_LABEL(overflow) // reuse the code there + + // We come here if at least one input is a NaN. Hand off to fnan2, which + // propagates an appropriate NaN to the output, dealing with the special + // cases of signalling/quiet NaNs. +LOCAL_LABEL(nan): + bl SYMBOL_NAME(__compiler_rt_fnan2) + pop {r4,r5,r6,pc} + + // Return a quiet NaN as the result of infinity * zero. +LOCAL_LABEL(infzero): + ldr r0, =0x7fc00000 + pop {r4,r5,r6,pc} + +END_COMPILERRT_FUNCTION(__mulsf3) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt index 63f4c94605c90..8e3cb35183ba7 100644 --- a/compiler-rt/test/builtins/CMakeLists.txt +++ b/compiler-rt/test/builtins/CMakeLists.txt @@ -35,6 +35,10 @@ if(APPLE) darwin_filter_host_archs(BUILTIN_SUPPORTED_ARCH BUILTIN_TEST_ARCH) endif() +if(COMPILER_RT_ARM_OPTIMIZED_FP) + list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_ARM_OPTIMIZED_FP) +endif() + foreach(arch ${BUILTIN_TEST_ARCH}) set(BUILTINS_TEST_TARGET_ARCH ${arch}) string(TOLOWER "-${arch}-${OS_NAME}" BUILTINS_TEST_CONFIG_SUFFIX) diff --git a/compiler-rt/test/builtins/Unit/divsf3_test.c b/compiler-rt/test/builtins/Unit/divsf3_test.c index f8cb6169ac283..12c5df5fdaae1 100644 --- a/compiler-rt/test/builtins/Unit/divsf3_test.c +++ b/compiler-rt/test/builtins/Unit/divsf3_test.c @@ -1,115 +1,428 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // RUN: %clang_builtins %s %librt -o %t && %run %t // REQUIRES: librt_has_divsf3 #include "int_lib.h" +#include #include #include "fp_test.h" +// By default this test uses compareResultF to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more +// detailed handling of NaNs, we tighten up the check and include some extra +// test cases specific to that NaN policy. +#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + // Returns: a / b COMPILER_RT_ABI float __divsf3(float a, float b); -int test__divsf3(float a, float b, uint32_t expected) -{ - float x = __divsf3(a, b); - int ret = compareResultF(x, expected); +int test__divsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + float x = __divsf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep32(x) == expected_rep; +#else + int ret = compareResultF(x, expected_rep); +#endif - if (ret){ - printf("error in test__divsf3(%.20e, %.20e) = %.20e, " - "expected %.20e\n", a, b, x, - fromRep32(expected)); - } - return ret; + if (ret) { + printf("error in test__divsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 + ", expected %08" PRIx32 "\n", + a_rep, b_rep, toRep32(x), expected_rep); + } + return ret; } -int main() -{ - // Returned NaNs are assumed to be qNaN by default - - // qNaN / any = qNaN - if (test__divsf3(makeQNaN32(), 3.F, UINT32_C(0x7fc00000))) - return 1; - // NaN / any = NaN - if (test__divsf3(makeNaN32(UINT32_C(0x123)), 3.F, UINT32_C(0x7fc00000))) - return 1; - // any / qNaN = qNaN - if (test__divsf3(3.F, makeQNaN32(), UINT32_C(0x7fc00000))) - return 1; - // any / NaN = NaN - if (test__divsf3(3.F, makeNaN32(UINT32_C(0x123)), UINT32_C(0x7fc00000))) - return 1; - - // +Inf / positive = +Inf - if (test__divsf3(makeInf32(), 3.F, UINT32_C(0x7f800000))) - return 1; - // +Inf / negative = -Inf - if (test__divsf3(makeInf32(), -3.F, UINT32_C(0xff800000))) - return 1; - // -Inf / positive = -Inf - if (test__divsf3(makeNegativeInf32(), 3.F, UINT32_C(0xff800000))) - return 1; - // -Inf / negative = +Inf - if (test__divsf3(makeNegativeInf32(), -3.F, UINT32_C(0x7f800000))) - return 1; - - // Inf / Inf = NaN - if (test__divsf3(makeInf32(), makeInf32(), UINT32_C(0x7fc00000))) - return 1; - // 0.0 / 0.0 = NaN - if (test__divsf3(+0x0.0p+0F, +0x0.0p+0F, UINT32_C(0x7fc00000))) - return 1; - // +0.0 / +Inf = +0.0 - if (test__divsf3(+0x0.0p+0F, makeInf32(), UINT32_C(0x0))) - return 1; - // +Inf / +0.0 = +Inf - if (test__divsf3(makeInf32(), +0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - - // positive / +0.0 = +Inf - if (test__divsf3(+1.F, +0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - // positive / -0.0 = -Inf - if (test__divsf3(+1.F, -0x0.0p+0F, UINT32_C(0xff800000))) - return 1; - // negative / +0.0 = -Inf - if (test__divsf3(-1.F, +0x0.0p+0F, UINT32_C(0xff800000))) - return 1; - // negative / -0.0 = +Inf - if (test__divsf3(-1.F, -0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - - // 1/3 - if (test__divsf3(1.F, 3.F, UINT32_C(0x3eaaaaab))) - return 1; - // smallest normal result - if (test__divsf3(0x1.0p-125F, 2.F, UINT32_C(0x00800000))) - return 1; +int main(void) { + int status = 0; - // divisor is exactly 1.0 - if (test__divsf3(0x1.0p+0F, 0x1.0p+0F, UINT32_C(0x3f800000))) - return 1; - // divisor is 
truncated to exactly 1.0 in UQ1.15 - if (test__divsf3(0x1.0p+0F, 0x1.0001p+0F, UINT32_C(0x3f7fff00))) - return 1; + status |= test__divsf3(0x00000000, 0x00000001, 0x00000000); + status |= test__divsf3(0x00000000, 0x007fffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x00800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x00ffffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x3f800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x40a00000, 0x00000000); + status |= test__divsf3(0x00000000, 0x7effffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x7f000000, 0x00000000); + status |= test__divsf3(0x00000000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x80000002, 0x80000000); + status |= test__divsf3(0x00000000, 0x807fffff, 0x80000000); + status |= test__divsf3(0x00000000, 0x80800001, 0x80000000); + status |= test__divsf3(0x00000000, 0x81000000, 0x80000000); + status |= test__divsf3(0x00000000, 0xc0400000, 0x80000000); + status |= test__divsf3(0x00000000, 0xc0e00000, 0x80000000); + status |= test__divsf3(0x00000000, 0xfe7fffff, 0x80000000); + status |= test__divsf3(0x00000000, 0xff000000, 0x80000000); + status |= test__divsf3(0x00000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x00000001, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00000001, 0x3e000000, 0x00000008); + status |= test__divsf3(0x00000001, 0x3f000000, 0x00000002); + status |= test__divsf3(0x00000001, 0x40000000, 0x00000000); + status |= test__divsf3(0x00000001, 0x7f7fffff, 0x00000000); + status |= test__divsf3(0x00000001, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00000001, 0xc0000000, 0x80000000); + status |= test__divsf3(0x00000001, 0xff7fffff, 0x80000000); + status |= test__divsf3(0x00000002, 0x80000000, 0xff800000); + status |= test__divsf3(0x00000002, 0xff800000, 0x80000000); + status |= test__divsf3(0x00000009, 0x41100000, 0x00000001); + status |= test__divsf3(0x00000009, 0xc1100000, 0x80000001); + status |= test__divsf3(0x007ffff7, 0x3f7ffffe, 0x007ffff8); + status |= test__divsf3(0x007ffffe, 0x3f7ffffe, 0x007fffff); + status |= test__divsf3(0x007fffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x007fffff, 0x3b000000, 0x04fffffe); + status |= test__divsf3(0x007fffff, 0x3f000000, 0x00fffffe); + status |= test__divsf3(0x007fffff, 0x3f800000, 0x007fffff); + status |= test__divsf3(0x007fffff, 0x3f800002, 0x007ffffd); + status |= test__divsf3(0x007fffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x007fffff, 0x80000000, 0xff800000); + status |= test__divsf3(0x007fffff, 0xbf800000, 0x807fffff); + status |= test__divsf3(0x007fffff, 0xff800000, 0x80000000); + status |= test__divsf3(0x00800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00800000, 0x3f800001, 0x007fffff); + status |= test__divsf3(0x00800000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00800001, 0x3f800002, 0x007fffff); + status |= test__divsf3(0x00800001, 0x80000000, 0xff800000); + status |= test__divsf3(0x00800001, 0xff800000, 0x80000000); + status |= test__divsf3(0x00800002, 0x3f800006, 0x007ffffc); + status |= test__divsf3(0x00fffffe, 0x40000000, 0x007fffff); + status |= test__divsf3(0x00ffffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00ffffff, 0x40000000, 0x00800000); + status |= test__divsf3(0x00ffffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x01000000, 0x00800000, 0x40000000); + status |= test__divsf3(0x01000000, 0x80000000, 0xff800000); + status |= test__divsf3(0x01000000, 0xc0000000, 0x80800000); + status |= 
test__divsf3(0x01000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x01000001, 0x00800001, 0x40000000); + status |= test__divsf3(0x01000001, 0xc0000000, 0x80800001); + status |= test__divsf3(0x01000003, 0x80800003, 0xc0000000); + status |= test__divsf3(0x01000003, 0xc0000000, 0x80800003); + status |= test__divsf3(0x3f7ffff7, 0x3f7ffffb, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffff7, 0x3f7ffffe, 0x3f7ffff9); + status |= test__divsf3(0x3f7ffff8, 0x3f7ffffc, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffff8, 0x3f7ffffd, 0x3f7ffffb); + status |= test__divsf3(0x3f7ffffa, 0x3f7ffff9, 0x3f800001); + status |= test__divsf3(0x3f7ffffb, 0x3f7ffff9, 0x3f800001); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffff9, 0x3f800002); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffffd, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffffe, 0x3f7ffffe); + status |= test__divsf3(0x3f7ffffc, 0x3f7fffff, 0x3f7ffffd); + status |= test__divsf3(0x3f7ffffc, 0x3f800001, 0x3f7ffffa); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffff9, 0x3f800002); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffffc, 0x3f800001); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffffe, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffd, 0x3f7fffff, 0x3f7ffffe); + status |= test__divsf3(0x3f7ffffd, 0x3f800001, 0x3f7ffffb); + status |= test__divsf3(0x3f7ffffd, 0x3f800002, 0x3f7ffff9); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffff9, 0x3f800003); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffffc, 0x3f800001); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffffd, 0x3f800001); + status |= test__divsf3(0x3f7ffffe, 0x3f7fffff, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffe, 0x3f800001, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffffe, 0x3f800002, 0x3f7ffffa); + status |= test__divsf3(0x3f7ffffe, 0x3f800003, 0x3f7ffff8); + status |= test__divsf3(0x3f7fffff, 0x3f7ffff9, 0x3f800003); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffc, 0x3f800002); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffd, 0x3f800001); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffe, 0x3f800001); + status |= test__divsf3(0x3f7fffff, 0x3f800001, 0x3f7ffffd); + status |= test__divsf3(0x3f7fffff, 0x3f800002, 0x3f7ffffb); + status |= test__divsf3(0x3f7fffff, 0x3f800003, 0x3f7ffff9); + status |= test__divsf3(0x3f7fffff, 0x3f800004, 0x3f7ffff7); + status |= test__divsf3(0x3f800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x3f800000, 0x3f7ffff7, 0x3f800005); + status |= test__divsf3(0x3f800000, 0x3f7ffff8, 0x3f800004); + status |= test__divsf3(0x3f800000, 0x3f7ffffb, 0x3f800003); + status |= test__divsf3(0x3f800000, 0x3f7ffffc, 0x3f800002); + status |= test__divsf3(0x3f800000, 0x3f7ffffd, 0x3f800002); + status |= test__divsf3(0x3f800000, 0x3f7ffffe, 0x3f800001); + status |= test__divsf3(0x3f800000, 0x3f7fffff, 0x3f800001); + status |= test__divsf3(0x3f800000, 0x3f800000, 0x3f800000); + status |= test__divsf3(0x3f800000, 0x3f800001, 0x3f7ffffe); + status |= test__divsf3(0x3f800000, 0x3f800002, 0x3f7ffffc); + status |= test__divsf3(0x3f800000, 0x3f800003, 0x3f7ffffa); + status |= test__divsf3(0x3f800000, 0x3f800004, 0x3f7ffff8); + status |= test__divsf3(0x3f800000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x3f800001, 0x3f7ffffb, 0x3f800004); + status |= test__divsf3(0x3f800001, 0x3f7ffffd, 0x3f800003); + status |= test__divsf3(0x3f800001, 0x3f7ffffe, 0x3f800002); + status |= test__divsf3(0x3f800001, 0x3f7fffff, 0x3f800002); + status |= test__divsf3(0x3f800001, 0x3f800002, 0x3f7ffffe); + status |= test__divsf3(0x3f800001, 0x3f800003, 0x3f7ffffc); + status |= 
test__divsf3(0x3f800002, 0x3f7ffffc, 0x3f800004); + status |= test__divsf3(0x3f800002, 0x3f7ffffd, 0x3f800004); + status |= test__divsf3(0x3f800002, 0x3f7ffffe, 0x3f800003); + status |= test__divsf3(0x3f800002, 0x3f7fffff, 0x3f800003); + status |= test__divsf3(0x3f800002, 0x3f800001, 0x3f800001); + status |= test__divsf3(0x3f800002, 0x3f800003, 0x3f7ffffe); + status |= test__divsf3(0x3f800003, 0x3f7ffffd, 0x3f800005); + status |= test__divsf3(0x3f800003, 0x3f7ffffe, 0x3f800004); + status |= test__divsf3(0x3f800003, 0x3f7fffff, 0x3f800004); + status |= test__divsf3(0x3f800003, 0x3f800001, 0x3f800002); + status |= test__divsf3(0x3f800004, 0x3f7ffffe, 0x3f800005); + status |= test__divsf3(0x3f800004, 0x3f800001, 0x3f800003); + status |= test__divsf3(0x3f800004, 0x3f800007, 0x3f7ffffa); + status |= test__divsf3(0x3f800005, 0x3f7fffff, 0x3f800006); + status |= test__divsf3(0x3f800006, 0x3f800008, 0x3f7ffffc); + status |= test__divsf3(0x3f800007, 0x3f800002, 0x3f800005); + status |= test__divsf3(0x3f800009, 0x3f800008, 0x3f800001); + status |= test__divsf3(0x40000000, 0x3f800000, 0x40000000); + status |= test__divsf3(0x40000000, 0xbf800000, 0xc0000000); + status |= test__divsf3(0x40400000, 0x80000000, 0xff800000); + status |= test__divsf3(0x40400000, 0xc0400000, 0xbf800000); + status |= test__divsf3(0x40400000, 0xff800000, 0x80000000); + status |= test__divsf3(0x40a00000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x40a00000, 0x40a00000, 0x3f800000); + status |= test__divsf3(0x40a00000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x40e00000, 0x80000000, 0xff800000); + status |= test__divsf3(0x40e00000, 0xff800000, 0x80000000); + status |= test__divsf3(0x41000000, 0x40000000, 0x40800000); + status |= test__divsf3(0x41100000, 0x40400000, 0x40400000); + status |= test__divsf3(0x7b000000, 0x05000000, 0x7f800000); + status |= test__divsf3(0x7e7fffff, 0x80000000, 0xff800000); + status |= test__divsf3(0x7efffffd, 0xc0000000, 0xfe7ffffd); + status |= test__divsf3(0x7effffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7effffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x7f000000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x007fffff, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x3f000000, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x40000000, 0x7e800000); + status |= test__divsf3(0x7f000000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x7f000000, 0x80000000, 0xff800000); + status |= test__divsf3(0x7f000000, 0xbf000000, 0xff800000); + status |= test__divsf3(0x7f000000, 0xc0000000, 0xfe800000); + status |= test__divsf3(0x7f000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x7f000003, 0xfe800003, 0xc0000000); + status |= test__divsf3(0x7f7ffffd, 0x40800000, 0x7e7ffffd); + status |= test__divsf3(0x7f7ffffd, 0xc0800000, 0xfe7ffffd); + status |= test__divsf3(0x7f7fffff, 0x00000001, 0x7f800000); + status |= test__divsf3(0x7f7fffff, 0x3f7fffff, 0x7f800000); + status |= test__divsf3(0x7f7fffff, 0x7e7fffff, 0x40800000); + status |= test__divsf3(0x7f7fffff, 0x7effffff, 0x40000000); + status |= test__divsf3(0x7f7fffff, 0xc0000000, 0xfeffffff); + status |= test__divsf3(0x7f7fffff, 0xfe7fffff, 0xc0800000); + status |= test__divsf3(0x7f7fffff, 0xff800000, 0x80000000); + status |= test__divsf3(0x7f800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00000001, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00800000, 0x7f800000); + status |= 
test__divsf3(0x7f800000, 0x00ffffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x3f800000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x40a00000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x7effffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x7f000000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x80000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0x80000002, 0xff800000); + status |= test__divsf3(0x7f800000, 0x807fffff, 0xff800000); + status |= test__divsf3(0x7f800000, 0x80800001, 0xff800000); + status |= test__divsf3(0x7f800000, 0x81000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xc0400000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xc0e00000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xfe7fffff, 0xff800000); + status |= test__divsf3(0x7f800000, 0xff000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xff7fffff, 0xff800000); + status |= test__divsf3(0x80000000, 0x00000003, 0x80000000); + status |= test__divsf3(0x80000000, 0x007fffff, 0x80000000); + status |= test__divsf3(0x80000000, 0x00800001, 0x80000000); + status |= test__divsf3(0x80000000, 0x01000000, 0x80000000); + status |= test__divsf3(0x80000000, 0x40000000, 0x80000000); + status |= test__divsf3(0x80000000, 0x40c00000, 0x80000000); + status |= test__divsf3(0x80000000, 0x7e7fffff, 0x80000000); + status |= test__divsf3(0x80000000, 0x7e800000, 0x80000000); + status |= test__divsf3(0x80000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80000000, 0x80000004, 0x00000000); + status |= test__divsf3(0x80000000, 0x807fffff, 0x00000000); + status |= test__divsf3(0x80000000, 0x80800000, 0x00000000); + status |= test__divsf3(0x80000000, 0x80ffffff, 0x00000000); + status |= test__divsf3(0x80000000, 0xc0800000, 0x00000000); + status |= test__divsf3(0x80000000, 0xc1000000, 0x00000000); + status |= test__divsf3(0x80000000, 0xfe800000, 0x00000000); + status |= test__divsf3(0x80000000, 0xfeffffff, 0x00000000); + status |= test__divsf3(0x80000000, 0xff800000, 0x00000000); + status |= test__divsf3(0x80000001, 0x3f000000, 0x80000002); + status |= test__divsf3(0x80000001, 0x40000000, 0x80000000); + status |= test__divsf3(0x80000001, 0x7f7fffff, 0x80000000); + status |= test__divsf3(0x80000001, 0xc0000000, 0x00000000); + status |= test__divsf3(0x80000001, 0xff7fffff, 0x00000000); + status |= test__divsf3(0x80000003, 0x00000000, 0xff800000); + status |= test__divsf3(0x80000003, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80000004, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80000004, 0xff800000, 0x00000000); + status |= test__divsf3(0x807ffff8, 0x3f7ffffe, 0x807ffff9); + status |= test__divsf3(0x807fffff, 0x00000000, 0xff800000); + status |= test__divsf3(0x807fffff, 0x7f800000, 0x80000000); + status |= test__divsf3(0x807fffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0x807fffff, 0xff800000, 0x00000000); + status |= test__divsf3(0x80800000, 0x3f800001, 0x807fffff); + status |= test__divsf3(0x80800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80800000, 0xff800000, 0x00000000); + status |= test__divsf3(0x80800001, 0x00000000, 0xff800000); + status |= test__divsf3(0x80800001, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80ffffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80ffffff, 0xff800000, 0x00000000); + status |= test__divsf3(0x81000000, 0x00000000, 0xff800000); + status |= test__divsf3(0x81000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0x81000001, 0x00800001, 0xc0000000); + status |= 
test__divsf3(0x81000005, 0x00800005, 0xc0000000); + status |= test__divsf3(0xbf800000, 0x3f800000, 0xbf800000); + status |= test__divsf3(0xbf800000, 0xbf800000, 0x3f800000); + status |= test__divsf3(0xc0000000, 0x00000000, 0xff800000); + status |= test__divsf3(0xc0000000, 0x3f800000, 0xc0000000); + status |= test__divsf3(0xc0000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xc0000000, 0xbf800000, 0x40000000); + status |= test__divsf3(0xc0800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xc0800000, 0xff800000, 0x00000000); + status |= test__divsf3(0xc0c00000, 0x00000000, 0xff800000); + status |= test__divsf3(0xc0c00000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xc0c00000, 0xc0400000, 0x40000000); + status |= test__divsf3(0xc0e00000, 0x40e00000, 0xbf800000); + status |= test__divsf3(0xc1000000, 0x40000000, 0xc0800000); + status |= test__divsf3(0xc1000000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xc1000000, 0xff800000, 0x00000000); + status |= test__divsf3(0xc1100000, 0xc0400000, 0x40400000); + status |= test__divsf3(0xfe7fffff, 0x00000000, 0xff800000); + status |= test__divsf3(0xfe7fffff, 0x7f800000, 0x80000000); + status |= test__divsf3(0xfe800000, 0x00000000, 0xff800000); + status |= test__divsf3(0xfe800000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xfe800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xfe800000, 0xff800000, 0x00000000); + status |= test__divsf3(0xfeffffff, 0x40000000, 0xfe7fffff); + status |= test__divsf3(0xfeffffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0xff000000, 0x3f000000, 0xff800000); + status |= test__divsf3(0xff000000, 0xbf000000, 0x7f800000); + status |= test__divsf3(0xff000001, 0x7e800001, 0xc0000000); + status |= test__divsf3(0xff7ffffd, 0x40800000, 0xfe7ffffd); + status |= test__divsf3(0xff7ffffd, 0xc0800000, 0x7e7ffffd); + status |= test__divsf3(0xff7fffff, 0x7e7fffff, 0xc0800000); + status |= test__divsf3(0xff7fffff, 0xfe7fffff, 0x40800000); + status |= test__divsf3(0xff7fffff, 0xff800000, 0x00000000); + status |= test__divsf3(0xff800000, 0x00000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x00000003, 0xff800000); + status |= test__divsf3(0xff800000, 0x007fffff, 0xff800000); + status |= test__divsf3(0xff800000, 0x00800001, 0xff800000); + status |= test__divsf3(0xff800000, 0x01000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x40000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x40c00000, 0xff800000); + status |= test__divsf3(0xff800000, 0x7e800000, 0xff800000); + status |= test__divsf3(0xff800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80000004, 0x7f800000); + status |= test__divsf3(0xff800000, 0x807fffff, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80ffffff, 0x7f800000); + status |= test__divsf3(0xff800000, 0xc0800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xc1000000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xfe800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xff7fffff, 0x7f800000); + status |= test__divsf3(0x2cbed883, 0x333f6113, 0x38ff4953); + status |= test__divsf3(0x3f87ffff, 0x7f001000, 0x0043f781); - // smallest normal value divided by 2.0 - if (test__divsf3(0x1.0p-126F, 2.0F, UINT32_C(0x00400000))) - return 1; - // smallest subnormal result - if (test__divsf3(0x1.0p-126F, 0x1p+23F, UINT32_C(0x00000001))) - return 1; + // Test that the result of an operation is a NaN at all when it should be. 
+ // + // In most configurations these tests' results are checked using + // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000, + // which causes compareResultF to accept any NaN encoding. We also use the + // same value as the input NaN in tests that have one, so that even in + // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is + // still the exact expected NaN. + status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x3f800000, 0x7fc00000, 0x7fc00000); + status |= test__divsf3(0x7fc00000, 0x3f800000, 0x7fc00000); + status |= test__divsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); - // some misc test cases obtained by fuzzing against h/w implementation - if (test__divsf3(-0x1.3e75e6p-108F, -0x1.cf372p+38F, UINT32_C(0x00000006))) - return 1; - if (test__divsf3(0x1.e77c54p+81F, -0x1.e77c52p-47F, UINT32_C(0xff800000))) - return 1; - if (test__divsf3(0x1.fffffep-126F, 2.F, UINT32_C(0x00800000))) - return 1; +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/divsf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7fc00000.
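The expected outputs in the ARM_NAN_HANDLING vectors that follow can be derived mechanically from those rules. As a rough standalone sketch only (this helper is not part of the test file; the names is_nan32, is_snan32 and arm_result_nan are invented here for illustration), the NaN selected for a two-operand operation looks like this:

#include <stdint.h>

/* A single-precision NaN has all exponent bits set and a nonzero mantissa;
   mantissa bit 22 distinguishes quiet (1) from signalling (0) NaNs. */
static int is_nan32(uint32_t x) {
  return (x & 0x7f800000u) == 0x7f800000u && (x & 0x007fffffu) != 0;
}
static int is_snan32(uint32_t x) { return is_nan32(x) && !(x & 0x00400000u); }

/* NaN returned for operands a and b under the rules described above;
   0x7fc00000 covers an invalid operation with no NaN among the inputs. */
static uint32_t arm_result_nan(uint32_t a, uint32_t b) {
  if (is_snan32(a))
    return a | 0x00400000u; /* first signalling NaN wins, quietened in place */
  if (is_snan32(b))
    return b | 0x00400000u;
  if (is_nan32(a))
    return a; /* otherwise the first quiet NaN is passed through unchanged */
  if (is_nan32(b))
    return b;
  return 0x7fc00000u; /* no input NaN: the quiet NaN with fewest bits set */
}

For example, in the vector (0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740) below, the first operand is a signalling NaN and the second a quiet one, so the first is quietened in place, giving the expected 0x7fcbd740.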
- // test 1 / (1 - eps(0.5)) = 1 + eps(1) - if (test__divsf3(1.0F, 0x1.fffffep-1F, UINT32_C(0x3f800001))) - return 1; + status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__divsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__divsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__divsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__divsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__divsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__divsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__divsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__divsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__divsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__divsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__divsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__divsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__divsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__divsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__divsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__divsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__divsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__divsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__divsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__divsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__divsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__divsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= test__divsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__divsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); + status |= test__divsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__divsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= test__divsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= test__divsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__divsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__divsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= test__divsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__divsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__divsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__divsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__divsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__divsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__divsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__divsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__divsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__divsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__divsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__divsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__divsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= 
test__divsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); + status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); + status |= test__divsf3(0xff800000, 0x7fde0397, 0x7fde0397); + status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000); +#endif // ARM_NAN_HANDLING - return 0; + return status; } diff --git a/compiler-rt/test/builtins/Unit/mulsf3_test.c b/compiler-rt/test/builtins/Unit/mulsf3_test.c new file mode 100644 index 0000000000000..7dc7c8ad39c32 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/mulsf3_test.c @@ -0,0 +1,616 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_mulsf3 + +#include "int_lib.h" +#include <inttypes.h> +#include <stdio.h> + +#include "fp_test.h" + +// By default this test uses compareResultF to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more +// detailed handling of NaNs, we tighten up the check and include some extra +// test cases specific to that NaN policy. +#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + +// Returns: a * b +COMPILER_RT_ABI float __mulsf3(float a, float b); + +int test__mulsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + float x = __mulsf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep32(x) != expected_rep; +#else + int ret = compareResultF(x, expected_rep); +#endif + + if (ret) { + printf("error in test__mulsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 + ", expected %08" PRIx32 "\n", + a_rep, b_rep, toRep32(x), expected_rep); + } + return ret; +} + +int main(void) { + int status = 0; + + status |= test__mulsf3(0x00000000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00000000, 0x007fffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x00ffffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x3f800000, 0x00000000); + status |= test__mulsf3(0x00000000, 0x7effffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x80000000, 0x80000000); + status |= test__mulsf3(0x00000000, 0x80000002, 0x80000000); + status |= test__mulsf3(0x00000000, 0x807fffff, 0x80000000); + status |= test__mulsf3(0x00000000, 0x80800001, 0x80000000); + status |= test__mulsf3(0x00000000, 0x81000000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xc0400000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xfe7fffff, 0x80000000); + status |= test__mulsf3(0x00000000, 0xff000000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xff7fffff, 0x80000000); + status |= test__mulsf3(0x00000001, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00000001, 0x00000001, 0x00000000); + status |= test__mulsf3(0x00000001, 0x3f000000, 0x00000000); + status |= test__mulsf3(0x00000001, 0x3f7fffff, 0x00000001); + status |= test__mulsf3(0x00000001, 0x3f800000, 0x00000001); + status |= test__mulsf3(0x00000001, 0x40000000, 0x00000002); + status |= test__mulsf3(0x00000001, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x00000001, 0xbf7fffff, 0x80000001); + status |= test__mulsf3(0x00000006, 0x3f000000, 0x00000003); + status |= test__mulsf3(0x00000006, 0xbf000000, 0x80000003); + status |= 
test__mulsf3(0x00000008, 0x3e000000, 0x00000001); + status |= test__mulsf3(0x007ffff7, 0x81000003, 0x80000000); + status |= test__mulsf3(0x007ffff8, 0x3f800001, 0x007ffff9); + status |= test__mulsf3(0x007ffff8, 0x3f800008, 0x00800000); + status |= test__mulsf3(0x007ffff8, 0xbf800001, 0x807ffff9); + status |= test__mulsf3(0x007ffff8, 0xbf800008, 0x80800000); + status |= test__mulsf3(0x007ffffc, 0x40000000, 0x00fffff8); + status |= test__mulsf3(0x007ffffe, 0x3f7ffffc, 0x007ffffc); + status |= test__mulsf3(0x007ffffe, 0x3f800001, 0x007fffff); + status |= test__mulsf3(0x007ffffe, 0xbf800001, 0x807fffff); + status |= test__mulsf3(0x007fffff, 0x007ffffe, 0x00000000); + status |= test__mulsf3(0x007fffff, 0x3f800001, 0x00800000); + status |= test__mulsf3(0x007fffff, 0x40000000, 0x00fffffe); + status |= test__mulsf3(0x00800000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00800000, 0x00800000, 0x00000000); + status |= test__mulsf3(0x00800000, 0x3f7ffffe, 0x007fffff); + status |= test__mulsf3(0x00800000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x00800000, 0x80800000, 0x80000000); + status |= test__mulsf3(0x00800000, 0xc0000000, 0x81000000); + status |= test__mulsf3(0x00800001, 0x3f7ffffa, 0x007ffffe); + status |= test__mulsf3(0x00800001, 0x3f7ffffe, 0x00800000); + status |= test__mulsf3(0x00800001, 0xc0000000, 0x81000001); + status |= test__mulsf3(0x00800002, 0x3f7ffffc, 0x00800000); + status |= test__mulsf3(0x00fffff8, 0x3f000000, 0x007ffffc); + status |= test__mulsf3(0x00fffffe, 0x3f000000, 0x007fffff); + status |= test__mulsf3(0x00fffffe, 0xbf000000, 0x807fffff); + status |= test__mulsf3(0x00ffffff, 0x3f000000, 0x00800000); + status |= test__mulsf3(0x00ffffff, 0xbf000000, 0x80800000); + status |= test__mulsf3(0x3f000000, 0x80000001, 0x80000000); + status |= test__mulsf3(0x3f800000, 0x007ffffd, 0x007ffffd); + status |= test__mulsf3(0x3f800000, 0x01000003, 0x01000003); + status |= test__mulsf3(0x3f800000, 0x3f800000, 0x3f800000); + status |= test__mulsf3(0x3f800000, 0x40000000, 0x40000000); + status |= test__mulsf3(0x3f800000, 0x80000001, 0x80000001); + status |= test__mulsf3(0x3f800000, 0x80000009, 0x80000009); + status |= test__mulsf3(0x3f800001, 0x3f800001, 0x3f800002); + status |= test__mulsf3(0x3f800001, 0xbf800001, 0xbf800002); + status |= test__mulsf3(0x3f800001, 0xbf800002, 0xbf800003); + status |= test__mulsf3(0x3f800002, 0x3f800001, 0x3f800003); + status |= test__mulsf3(0x3f800002, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x3f800001, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x40000000, 0x00800000, 0x01000000); + status |= test__mulsf3(0x40000000, 0x00800001, 0x01000001); + status |= test__mulsf3(0x40000000, 0x3f800000, 0x40000000); + status |= test__mulsf3(0x40000000, 0x40400000, 0x40c00000); + status |= test__mulsf3(0x40000000, 0x7e800000, 0x7f000000); + status |= test__mulsf3(0x40000000, 0x7effffff, 0x7f7fffff); + status |= test__mulsf3(0x40000000, 0x807ffffd, 0x80fffffa); + status |= test__mulsf3(0x40000000, 0x80800003, 0x81000003); + status |= test__mulsf3(0x40000000, 0x80800005, 0x81000005); + status |= test__mulsf3(0x40000000, 0xbf800000, 0xc0000000); + status |= test__mulsf3(0x40000000, 0xfe7ffffd, 0xfefffffd); + status |= test__mulsf3(0x40000000, 0xfe800003, 0xff000003); + status |= test__mulsf3(0x403fffff, 0x3f7ffffd, 0x403ffffd); + status |= test__mulsf3(0x403fffff, 0x3f7ffffe, 0x403ffffe); + status |= test__mulsf3(0x403fffff, 0x3f7fffff, 0x403ffffe); + status |= test__mulsf3(0x403fffff, 0xbf7ffffd, 0xc03ffffd); + status |= 
test__mulsf3(0x40400000, 0x00000002, 0x00000006); + status |= test__mulsf3(0x40400000, 0x40000000, 0x40c00000); + status |= test__mulsf3(0x40400000, 0x40400000, 0x41100000); + status |= test__mulsf3(0x40400000, 0xc0000000, 0xc0c00000); + status |= test__mulsf3(0x40400001, 0x3f800001, 0x40400003); + status |= test__mulsf3(0x40400001, 0x3f800003, 0x40400006); + status |= test__mulsf3(0x40400001, 0xbf800003, 0xc0400006); + status |= test__mulsf3(0x40800000, 0x00000002, 0x00000008); + status |= test__mulsf3(0x40800000, 0x7e7fffff, 0x7f7fffff); + status |= test__mulsf3(0x40800000, 0xfe7fffff, 0xff7fffff); + status |= test__mulsf3(0x409fffff, 0x3f7fffff, 0x409ffffe); + status |= test__mulsf3(0x40a00000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x40a00000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x40a00001, 0x3f800001, 0x40a00002); + status |= test__mulsf3(0x40dfffff, 0x3f7ffffc, 0x40dffffc); + status |= test__mulsf3(0x40dfffff, 0x3f7fffff, 0x40dffffe); + status |= test__mulsf3(0x40e00000, 0x80000000, 0x80000000); + status |= test__mulsf3(0x40e00000, 0xff800000, 0xff800000); + status |= test__mulsf3(0x40e00001, 0x3f800001, 0x40e00003); + status |= test__mulsf3(0x7e7ffffd, 0x40800000, 0x7f7ffffd); + status |= test__mulsf3(0x7e7ffffd, 0xc0800000, 0xff7ffffd); + status |= test__mulsf3(0x7e800000, 0xc0000000, 0xff000000); + status |= test__mulsf3(0x7efffffd, 0xc0000008, 0xff800000); + status |= test__mulsf3(0x7effffff, 0xc0000000, 0xff7fffff); + status |= test__mulsf3(0x7f000000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x7f000000, 0x40000000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f000000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0xfe800000, 0xff800000); + status |= test__mulsf3(0x7f000000, 0xfe800004, 0xff800000); + status |= test__mulsf3(0x7f000000, 0xff000000, 0xff800000); + status |= test__mulsf3(0x7f000009, 0x7f7ffffa, 0x7f800000); + status |= test__mulsf3(0x7f000009, 0xc0c00002, 0xff800000); + status |= test__mulsf3(0x7f7fffff, 0x00000000, 0x00000000); + status |= test__mulsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x00ffffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x3f800000, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x7effffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x80000002, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x807fffff, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x80800001, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x81000000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xc0400000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff000000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff7fffff, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff800000, 0xff800000); + status |= test__mulsf3(0x80000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80000000, 0x40c00000, 0x80000000); + status |= test__mulsf3(0x80000000, 0x7f7fffff, 0x80000000); + status |= test__mulsf3(0x80000000, 0x80000000, 0x00000000); + status |= test__mulsf3(0x80000000, 0x80000004, 0x00000000); + status |= test__mulsf3(0x80000000, 0x80800000, 0x00000000); + status |= test__mulsf3(0x80000000, 0xc1000000, 0x00000000); + status |= test__mulsf3(0x80000000, 0xfe800000, 0x00000000); + status |= test__mulsf3(0x80000001, 0x00000001, 0x80000000); + status |= 
test__mulsf3(0x80000001, 0x40a00000, 0x80000005); + status |= test__mulsf3(0x80000002, 0x3f800000, 0x80000002); + status |= test__mulsf3(0x80000003, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80000003, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x80000004, 0xbf800000, 0x00000004); + status |= test__mulsf3(0x80000008, 0x3e000000, 0x80000001); + status |= test__mulsf3(0x807ffff7, 0x01000003, 0x80000000); + status |= test__mulsf3(0x807ffff7, 0x3f800001, 0x807ffff8); + status |= test__mulsf3(0x807ffffd, 0xc0000000, 0x00fffffa); + status |= test__mulsf3(0x807fffff, 0x00000000, 0x80000000); + status |= test__mulsf3(0x807fffff, 0x3f800001, 0x80800000); + status |= test__mulsf3(0x807fffff, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x807fffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0x807fffff, 0x807ffffe, 0x00000000); + status |= test__mulsf3(0x807fffff, 0xbf800000, 0x007fffff); + status |= test__mulsf3(0x807fffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x80800000, 0x00800000, 0x80000000); + status |= test__mulsf3(0x80800000, 0x80800000, 0x00000000); + status |= test__mulsf3(0x80800001, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80800001, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x80800001, 0xbf800000, 0x00800001); + status |= test__mulsf3(0x80fffffc, 0x3f000000, 0x807ffffe); + status |= test__mulsf3(0x80fffffc, 0xbf000000, 0x007ffffe); + status |= test__mulsf3(0x80fffffe, 0x3f800000, 0x80fffffe); + status |= test__mulsf3(0x80ffffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0x80ffffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x81000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0x81000000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xbf7fffff, 0xff7fffff, 0x7f7ffffe); + status |= test__mulsf3(0xbf800000, 0x00000009, 0x80000009); + status |= test__mulsf3(0xbf800000, 0x00800009, 0x80800009); + status |= test__mulsf3(0xbf800000, 0x3f800000, 0xbf800000); + status |= test__mulsf3(0xbf800000, 0x40000000, 0xc0000000); + status |= test__mulsf3(0xbf800000, 0xbf800000, 0x3f800000); + status |= test__mulsf3(0xbf800000, 0xc0000000, 0x40000000); + status |= test__mulsf3(0xbf800001, 0x3f800001, 0xbf800002); + status |= test__mulsf3(0xbf800001, 0xbf800001, 0x3f800002); + status |= test__mulsf3(0xbf800001, 0xbf800002, 0x3f800003); + status |= test__mulsf3(0xbf800002, 0x3f800001, 0xbf800003); + status |= test__mulsf3(0xbf800002, 0xbf800001, 0x3f800003); + status |= test__mulsf3(0xc0000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0xc0000000, 0x007ffffd, 0x80fffffa); + status |= test__mulsf3(0xc0000000, 0x00800001, 0x81000001); + status |= test__mulsf3(0xc0000000, 0x00800005, 0x81000005); + status |= test__mulsf3(0xc0000000, 0x00800009, 0x81000009); + status |= test__mulsf3(0xc0000000, 0x40400000, 0xc0c00000); + status |= test__mulsf3(0xc0000000, 0x7e7fffff, 0xfeffffff); + status |= test__mulsf3(0xc0000000, 0x7e800001, 0xff000001); + status |= test__mulsf3(0xc0000000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xc0000000, 0xbf800000, 0x40000000); + status |= test__mulsf3(0xc0000000, 0xc0400000, 0x40c00000); + status |= test__mulsf3(0xc03ffffe, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc03fffff, 0x3f7fffff, 0xc03ffffe); + status |= test__mulsf3(0xc0400000, 0x40400000, 0xc1100000); + status |= test__mulsf3(0xc0400000, 0xc0000000, 0x40c00000); + status |= test__mulsf3(0xc0400000, 0xc0400000, 0x41100000); + status |= test__mulsf3(0xc0400000, 0xff000000, 0x7f800000); + status |= 
test__mulsf3(0xc0400001, 0x3f800001, 0xc0400003); + status |= test__mulsf3(0xc0800000, 0x7e7fffff, 0xff7fffff); + status |= test__mulsf3(0xc0800000, 0x80000000, 0x00000000); + status |= test__mulsf3(0xc0800000, 0xfe7fffff, 0x7f7fffff); + status |= test__mulsf3(0xc0800000, 0xff800000, 0x7f800000); + status |= test__mulsf3(0xc09ffffe, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xc09fffff, 0xbf7fffff, 0x409ffffe); + status |= test__mulsf3(0xc0a00001, 0xbf800001, 0x40a00002); + status |= test__mulsf3(0xc0dffff9, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc1100000, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc1100001, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xfe7ffff9, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xfe7ffff9, 0xc07fffff, 0x7f7ffff8); + status |= test__mulsf3(0xfe7ffffd, 0x40800000, 0xff7ffffd); + status |= test__mulsf3(0xfe7ffffd, 0xc0800000, 0x7f7ffffd); + status |= test__mulsf3(0xfe7fffff, 0x00000000, 0x80000000); + status |= test__mulsf3(0xfe7fffff, 0x40000001, 0xff000000); + status |= test__mulsf3(0xfe7fffff, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xfe800000, 0x00000000, 0x80000000); + status |= test__mulsf3(0xfe800000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xfefffff7, 0x7e800001, 0xff800000); + status |= test__mulsf3(0xfeffffff, 0x3f800001, 0xff000000); + status |= test__mulsf3(0xfeffffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0xff000005, 0xff000001, 0x7f800000); + status |= test__mulsf3(0xff7ffffd, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xff7ffffd, 0xc0400001, 0x7f800000); + status |= test__mulsf3(0xff7ffffd, 0xff000001, 0x7f800000); + status |= test__mulsf3(0xff7fffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0xff7fffff, 0xff7fffff, 0x7f800000); + status |= test__mulsf3(0xff7fffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0x40c00000, 0xff800000); + status |= test__mulsf3(0xff800000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xff800000, 0x80000004, 0x7f800000); + status |= test__mulsf3(0xff800000, 0x80800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xc1000000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xfe800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x3089705f, 0x0ef36390, 0x0041558f); + status |= test__mulsf3(0x3089705f, 0x0e936390, 0x0027907d); + status |= test__mulsf3(0x3109705f, 0x0ef36390, 0x0082ab1e); + status |= test__mulsf3(0x3109705f, 0x0e936390, 0x004f20fa); + status |= test__mulsf3(0x3189705f, 0x0ef36390, 0x0102ab1e); + status |= test__mulsf3(0x3189705f, 0x0e936390, 0x009e41f5); + status |= test__mulsf3(0xb089705f, 0x0ef36390, 0x8041558f); + status |= test__mulsf3(0xb089705f, 0x0e936390, 0x8027907d); + status |= test__mulsf3(0xb109705f, 0x0ef36390, 0x8082ab1e); + status |= test__mulsf3(0xb109705f, 0x0e936390, 0x804f20fa); + status |= test__mulsf3(0xb189705f, 0x0ef36390, 0x8102ab1e); + status |= test__mulsf3(0xb189705f, 0x0e936390, 0x809e41f5); + status |= test__mulsf3(0x3089705f, 0x8ef36390, 0x8041558f); + status |= test__mulsf3(0x3089705f, 0x8e936390, 0x8027907d); + status |= test__mulsf3(0x3109705f, 0x8ef36390, 0x8082ab1e); + status |= test__mulsf3(0x3109705f, 0x8e936390, 0x804f20fa); + status |= test__mulsf3(0x3189705f, 0x8ef36390, 0x8102ab1e); + status |= test__mulsf3(0x3189705f, 0x8e936390, 0x809e41f5); + status |= test__mulsf3(0xb089705f, 0x8ef36390, 0x0041558f); + status |= test__mulsf3(0xb089705f, 0x8e936390, 0x0027907d); + status |= 
test__mulsf3(0xb109705f, 0x8ef36390, 0x0082ab1e); + status |= test__mulsf3(0xb109705f, 0x8e936390, 0x004f20fa); + status |= test__mulsf3(0xb189705f, 0x8ef36390, 0x0102ab1e); + status |= test__mulsf3(0xb189705f, 0x8e936390, 0x009e41f5); + status |= test__mulsf3(0x1f800001, 0x1fc00000, 0x00300000); + status |= test__mulsf3(0x1f800003, 0x1fc00000, 0x00300001); + status |= test__mulsf3(0x1f800001, 0x1fc00800, 0x00300200); + status |= test__mulsf3(0x1f800003, 0x1fc00800, 0x00300201); + status |= test__mulsf3(0x36e4588a, 0x29b47cbd, 0x2120fd85); + status |= test__mulsf3(0x3fea3b26, 0x3f400000, 0x3fafac5c); + status |= test__mulsf3(0x6fea3b26, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20ea3b26, 0x1ec00000, 0x0057d62e); + status |= test__mulsf3(0x3f8f11bb, 0x3fc00000, 0x3fd69a98); + status |= test__mulsf3(0x6f8f11bb, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f400000, 0x006b4d4c); + status |= test__mulsf3(0x3f8f11bb, 0x3f800000, 0x3f8f11bb); + status |= test__mulsf3(0x6f8f11bb, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f000000, 0x004788de); + status |= test__mulsf3(0x3f8f11bb, 0x3fd7f48d, 0x3ff1611f); + status |= test__mulsf3(0x6f8f11bb, 0x4fd7f48d, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f57f48d, 0x0078b090); + status |= test__mulsf3(0x3f8f11bb, 0x3fa80b73, 0x3fbbd412); + status |= test__mulsf3(0x6f8f11bb, 0x4fa80b73, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f280b73, 0x005dea09); + status |= test__mulsf3(0x3f8f11bb, 0x3f97f48d, 0x3fa9d842); + status |= test__mulsf3(0x6f8f11bb, 0x4f97f48d, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f17f48d, 0x0054ec21); + status |= test__mulsf3(0x3f8f11bb, 0x3f680b73, 0x3f81ae78); + status |= test__mulsf3(0x6f8f11bb, 0x4f680b73, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1ee80b73, 0x0040d73c); + status |= test__mulsf3(0x3fff5dd8, 0x3f600000, 0x3fdf721d); + status |= test__mulsf3(0x6fff5dd8, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20ff5dd8, 0x1ee00000, 0x006fb90e); + status |= test__mulsf3(0x3fff5dd8, 0x3f100000, 0x3f8fa4ca); + status |= test__mulsf3(0x6fff5dd8, 0x4f100000, 0x7f800000); + status |= test__mulsf3(0x20ff5dd8, 0x1e900000, 0x0047d265); + status |= test__mulsf3(0x3fffe96b, 0x3f7efb43, 0x3ffee4c5); + status |= test__mulsf3(0x6fffe96b, 0x4f7efb43, 0x7f800000); + status |= test__mulsf3(0x20ffe96b, 0x1efefb43, 0x007f7263); + status |= test__mulsf3(0x3fffe96b, 0x3f0104bd, 0x3f80f95b); + status |= test__mulsf3(0x6fffe96b, 0x4f0104bd, 0x7f800000); + status |= test__mulsf3(0x20ffe96b, 0x1e8104bd, 0x00407cae); + status |= test__mulsf3(0x3f8fbbb7, 0x3fa6edf9, 0x3fbb72aa); + status |= test__mulsf3(0x6f8fbbb7, 0x4fa6edf9, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f26edf9, 0x005db955); + status |= test__mulsf3(0x3f8fbbb7, 0x3fd91207, 0x3ff3c07b); + status |= test__mulsf3(0x6f8fbbb7, 0x4fd91207, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f591207, 0x0079e03d); + status |= test__mulsf3(0x3f8fbbb7, 0x3f991207, 0x3fabe29f); + status |= test__mulsf3(0x6f8fbbb7, 0x4f991207, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f191207, 0x0055f150); + status |= test__mulsf3(0x3f8fbbb7, 0x3f66edf9, 0x3f81a843); + status |= test__mulsf3(0x6f8fbbb7, 0x4f66edf9, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1ee6edf9, 0x0040d421); + status |= test__mulsf3(0x3fdb62f3, 0x3f7879c5, 0x3fd4f036); + status |= test__mulsf3(0x6fdb62f3, 0x4f7879c5, 0x7f800000); + status |= test__mulsf3(0x20db62f3, 0x1ef879c5, 0x006a781b); + status |= 
test__mulsf3(0x3faaea45, 0x3f8b6773, 0x3fba2489); + status |= test__mulsf3(0x6faaea45, 0x4f8b6773, 0x7f800000); + status |= test__mulsf3(0x20aaea45, 0x1f0b6773, 0x005d1244); + status |= test__mulsf3(0x3fafa7ec, 0x3f900000, 0x3fc59cea); + status |= test__mulsf3(0x6fafa7ec, 0x4f900000, 0x7f800000); + status |= test__mulsf3(0x20afa7ec, 0x1f100000, 0x0062ce75); + status |= test__mulsf3(0x3fcf8c8d, 0x3f271645, 0x3f8776be); + status |= test__mulsf3(0x6fcf8c8d, 0x4f271645, 0x7f800000); + status |= test__mulsf3(0x20cf8c8d, 0x1ea71645, 0x0043bb5f); + status |= test__mulsf3(0x3fc173ef, 0x3f901b0f, 0x3fd9cb52); + status |= test__mulsf3(0x6fc173ef, 0x4f901b0f, 0x7f800000); + status |= test__mulsf3(0x20c173ef, 0x1f101b0f, 0x006ce5a9); + status |= test__mulsf3(0x3fb48d33, 0x3f4a35fb, 0x3f8e9d7d); + status |= test__mulsf3(0x6fb48d33, 0x4f4a35fb, 0x7f800000); + status |= test__mulsf3(0x20b48d33, 0x1eca35fb, 0x00474ebe); + status |= test__mulsf3(0x3fc6f87b, 0x3f65d94d, 0x3fb2a52a); + status |= test__mulsf3(0x6fc6f87b, 0x4f65d94d, 0x7f800000); + status |= test__mulsf3(0x20c6f87b, 0x1ee5d94d, 0x00595295); + status |= test__mulsf3(0x3f860ae7, 0x3f969729, 0x3f9db312); + status |= test__mulsf3(0x6f860ae7, 0x4f969729, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f169729, 0x004ed989); + status |= test__mulsf3(0x3f860ae7, 0x3fc00000, 0x3fc9105a); + status |= test__mulsf3(0x6f860ae7, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f400000, 0x0064882d); + status |= test__mulsf3(0x3f860ae7, 0x3fe968d7, 0x3ff46da3); + status |= test__mulsf3(0x6f860ae7, 0x4fe968d7, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f6968d7, 0x007a36d1); + status |= test__mulsf3(0x3f860ae7, 0x3f800000, 0x3f860ae7); + status |= test__mulsf3(0x6f860ae7, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f000000, 0x00430574); + status |= test__mulsf3(0x3f860ae7, 0x3fa968d7, 0x3fb1682f); + status |= test__mulsf3(0x6f860ae7, 0x4fa968d7, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f2968d7, 0x0058b418); + status |= test__mulsf3(0x3f860ae7, 0x3fd69729, 0x3fe0b886); + status |= test__mulsf3(0x6f860ae7, 0x4fd69729, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f569729, 0x00705c43); + status |= test__mulsf3(0x3f9aecdd, 0x3fb14b75, 0x3fd696de); + status |= test__mulsf3(0x6f9aecdd, 0x4fb14b75, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f314b75, 0x006b4b6f); + status |= test__mulsf3(0x3f9aecdd, 0x3fceb48b, 0x3ffa2fb9); + status |= test__mulsf3(0x6f9aecdd, 0x4fceb48b, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f4eb48b, 0x007d17dc); + status |= test__mulsf3(0x3f9aecdd, 0x3fc00000, 0x3fe8634c); + status |= test__mulsf3(0x6f9aecdd, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f400000, 0x007431a6); + status |= test__mulsf3(0x3fd65dc6, 0x3f400000, 0x3fa0c654); + status |= test__mulsf3(0x6fd65dc6, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20d65dc6, 0x1ec00000, 0x0050632a); + status |= test__mulsf3(0x3feecf03, 0x3f5f93ab, 0x3fd09014); + status |= test__mulsf3(0x6feecf03, 0x4f5f93ab, 0x7f800000); + status |= test__mulsf3(0x20eecf03, 0x1edf93ab, 0x0068480a); + status |= test__mulsf3(0x3feecf03, 0x3f206c55, 0x3f95a670); + status |= test__mulsf3(0x6feecf03, 0x4f206c55, 0x7f800000); + status |= test__mulsf3(0x20eecf03, 0x1ea06c55, 0x004ad338); + status |= test__mulsf3(0x3f98feed, 0x3f60f11b, 0x3f866f27); + status |= test__mulsf3(0x6f98feed, 0x4f60f11b, 0x7f800000); + status |= test__mulsf3(0x2098feed, 0x1ee0f11b, 0x00433794); + status |= 
test__mulsf3(0x3f9a1b9d, 0x3f9c42b5, 0x3fbc21f8); + status |= test__mulsf3(0x6f9a1b9d, 0x4f9c42b5, 0x7f800000); + status |= test__mulsf3(0x209a1b9d, 0x1f1c42b5, 0x005e10fc); + status |= test__mulsf3(0x3f9a1b9d, 0x3f5c42b5, 0x3f8497e3); + status |= test__mulsf3(0x6f9a1b9d, 0x4f5c42b5, 0x7f800000); + status |= test__mulsf3(0x209a1b9d, 0x1edc42b5, 0x00424bf2); + status |= test__mulsf3(0x3f947044, 0x3f600000, 0x3f81e23c); + status |= test__mulsf3(0x6f947044, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20947044, 0x1ee00000, 0x0040f11e); + status |= test__mulsf3(0x3fa3fb77, 0x3f6eb1b9, 0x3f98e5a0); + status |= test__mulsf3(0x6fa3fb77, 0x4f6eb1b9, 0x7f800000); + status |= test__mulsf3(0x20a3fb77, 0x1eeeb1b9, 0x004c72d0); + status |= test__mulsf3(0x3fb291df, 0x3f466a1f, 0x3f8a66d9); + status |= test__mulsf3(0x6fb291df, 0x4f466a1f, 0x7f800000); + status |= test__mulsf3(0x20b291df, 0x1ec66a1f, 0x0045336c); + status |= test__mulsf3(0x3fde13d5, 0x3f6b7283, 0x3fcc3f8b); + status |= test__mulsf3(0x6fde13d5, 0x4f6b7283, 0x7f800000); + status |= test__mulsf3(0x20de13d5, 0x1eeb7283, 0x00661fc5); + status |= test__mulsf3(0x3fd5b211, 0x3f80810f, 0x3fd68987); + status |= test__mulsf3(0x6fd5b211, 0x4f80810f, 0x7f800000); + status |= test__mulsf3(0x20d5b211, 0x1f00810f, 0x006b44c4); + status |= test__mulsf3(0x3fd5b211, 0x3f3f7ef1, 0x3f9fd9d2); + status |= test__mulsf3(0x6fd5b211, 0x4f3f7ef1, 0x7f800000); + status |= test__mulsf3(0x20d5b211, 0x1ebf7ef1, 0x004fece9); + status |= test__mulsf3(0x3fadfbc4, 0x3f400000, 0x3f827cd3); + status |= test__mulsf3(0x6fadfbc4, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20adfbc4, 0x1ec00000, 0x00413e6a); + status |= test__mulsf3(0x3fd0ef03, 0x3f800000, 0x3fd0ef03); + status |= test__mulsf3(0x6fd0ef03, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1f000000, 0x00687782); + status |= test__mulsf3(0x3fd0ef03, 0x3f8673ab, 0x3fdb7705); + status |= test__mulsf3(0x6fd0ef03, 0x4f8673ab, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1f0673ab, 0x006dbb83); + status |= test__mulsf3(0x3fd0ef03, 0x3f798c55, 0x3fcbab02); + status |= test__mulsf3(0x6fd0ef03, 0x4f798c55, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1ef98c55, 0x0065d581); + status |= test__mulsf3(0x3fdd1181, 0x3f8ad17f, 0x3fefc0b1); + status |= test__mulsf3(0x6fdd1181, 0x4f8ad17f, 0x7f800000); + status |= test__mulsf3(0x20dd1181, 0x1f0ad17f, 0x0077e058); + status |= test__mulsf3(0x3fdd1181, 0x3f752e81, 0x3fd3b9e9); + status |= test__mulsf3(0x6fdd1181, 0x4f752e81, 0x7f800000); + status |= test__mulsf3(0x20dd1181, 0x1ef52e81, 0x0069dcf5); + status |= test__mulsf3(0x3f92efc6, 0x3fa00000, 0x3fb7abb8); + status |= test__mulsf3(0x6f92efc6, 0x4fa00000, 0x7f800000); + status |= test__mulsf3(0x2092efc6, 0x1f200000, 0x005bd5dc); + status |= test__mulsf3(0x3fdcefe6, 0x3f400000, 0x3fa5b3ec); + status |= test__mulsf3(0x6fdcefe6, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20dcefe6, 0x1ec00000, 0x0052d9f6); + status |= test__mulsf3(0x3fad6507, 0x3fa2f8b7, 0x3fdcc4c9); + status |= test__mulsf3(0x6fad6507, 0x4fa2f8b7, 0x7f800000); + status |= test__mulsf3(0x20ad6507, 0x1f22f8b7, 0x006e6264); + status |= test__mulsf3(0x3fad6507, 0x3f62f8b7, 0x3f99bba6); + status |= test__mulsf3(0x6fad6507, 0x4f62f8b7, 0x7f800000); + status |= test__mulsf3(0x20ad6507, 0x1ee2f8b7, 0x004cddd3); + status |= test__mulsf3(0x3fbfde6b, 0x3f8721bd, 0x3fca8f27); + status |= test__mulsf3(0x6fbfde6b, 0x4f8721bd, 0x7f800000); + status |= test__mulsf3(0x20bfde6b, 0x1f0721bd, 0x00654794); + status |= 
test__mulsf3(0x3fbfde6b, 0x3f4721bd, 0x3f953f2e); + status |= test__mulsf3(0x6fbfde6b, 0x4f4721bd, 0x7f800000); + status |= test__mulsf3(0x20bfde6b, 0x1ec721bd, 0x004a9f97); + status |= test__mulsf3(0x3ff40db4, 0x3f400000, 0x3fb70a47); + status |= test__mulsf3(0x6ff40db4, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20f40db4, 0x1ec00000, 0x005b8524); + status |= test__mulsf3(0x3ff40db4, 0x3f600000, 0x3fd58bfe); + status |= test__mulsf3(0x6ff40db4, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20f40db4, 0x1ee00000, 0x006ac5ff); + status |= test__mulsf3(0x3f9e20d3, 0x3f90c8a5, 0x3fb2dccc); + status |= test__mulsf3(0x6f9e20d3, 0x4f90c8a5, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1f10c8a5, 0x00596e66); + status |= test__mulsf3(0x3f9e20d3, 0x3fc00000, 0x3fed313c); + status |= test__mulsf3(0x6f9e20d3, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1f400000, 0x0076989e); + status |= test__mulsf3(0x3f9e20d3, 0x3f50c8a5, 0x3f80f69b); + status |= test__mulsf3(0x6f9e20d3, 0x4f50c8a5, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1ed0c8a5, 0x00407b4d); + status |= test__mulsf3(0x3f82e641, 0x3f8fd63f, 0x3f931856); + status |= test__mulsf3(0x6f82e641, 0x4f8fd63f, 0x7f800000); + status |= test__mulsf3(0x2082e641, 0x1f0fd63f, 0x00498c2b); + status |= test__mulsf3(0x3f9a1901, 0x3f96e701, 0x3fb5ab68); + status |= test__mulsf3(0x6f9a1901, 0x4f96e701, 0x7f800000); + status |= test__mulsf3(0x209a1901, 0x1f16e701, 0x005ad5b4); + status |= test__mulsf3(0x3fa21aa1, 0x3f7c4961, 0x3f9fc0ae); + status |= test__mulsf3(0x6fa21aa1, 0x4f7c4961, 0x7f800000); + status |= test__mulsf3(0x20a21aa1, 0x1efc4961, 0x004fe057); + status |= test__mulsf3(0x3fcd0767, 0x3f782457, 0x3fc6bc47); + status |= test__mulsf3(0x6fcd0767, 0x4f782457, 0x7f800000); + status |= test__mulsf3(0x20cd0767, 0x1ef82457, 0x00635e23); + status |= test__mulsf3(0x3fb875e1, 0x3f968e21, 0x3fd8f6f6); + status |= test__mulsf3(0x6fb875e1, 0x4f968e21, 0x7f800000); + status |= test__mulsf3(0x20b875e1, 0x1f168e21, 0x006c7b7b); + status |= test__mulsf3(0x3fc2f0d7, 0x3f5efd19, 0x3fa9cd95); + status |= test__mulsf3(0x6fc2f0d7, 0x4f5efd19, 0x7f800000); + status |= test__mulsf3(0x20c2f0d7, 0x1edefd19, 0x0054e6cb); + status |= test__mulsf3(0x7f7ffffe, 0x3f800001, 0x7f800000); + status |= test__mulsf3(0x00000003, 0xc00fffff, 0x80000007); + status |= test__mulsf3(0x00000003, 0x400fffff, 0x00000007); + status |= test__mulsf3(0x80000003, 0xc00fffff, 0x00000007); + status |= test__mulsf3(0x80000003, 0x400fffff, 0x80000007); + status |= test__mulsf3(0x00000003, 0xc00ffffd, 0x80000007); + status |= test__mulsf3(0x00000003, 0x400ffffd, 0x00000007); + status |= test__mulsf3(0x80000003, 0xc00ffffd, 0x00000007); + status |= test__mulsf3(0x80000003, 0x400ffffd, 0x80000007); + status |= test__mulsf3(0x3e00007f, 0x017c0000, 0x003f003f); + status |= test__mulsf3(0xcf7fff00, 0xc0ffff00, 0x50fffe00); + status |= test__mulsf3(0x3fdf7f00, 0x3fffff00, 0x405f7e21); + status |= test__mulsf3(0x19b92144, 0x1a310000, 0x00000001); + status |= test__mulsf3(0x19ffc008, 0x1a002004, 0x00000001); + status |= test__mulsf3(0x7f7ffff0, 0xc0000008, 0xff800000); + + // Test that the result of an operation is a NaN at all when it should be. + // + // In most configurations these tests' results are checked compared using + // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000, + // which causes compareResultF to accept any NaN encoding. 
We also use the + // same value as the input NaN in tests that have one, so that even in + // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is + // still the exact expected NaN. + status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000); + status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000); + status |= test__mulsf3(0x3f800000, 0x7fc00000, 0x7fc00000); + status |= test__mulsf3(0x7fc00000, 0x3f800000, 0x7fc00000); + status |= test__mulsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); + +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/mulsf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7fc00000. + + status |= test__mulsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__mulsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__mulsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__mulsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__mulsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__mulsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__mulsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__mulsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__mulsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__mulsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000); + status |= test__mulsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__mulsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000); + status |= test__mulsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__mulsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__mulsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__mulsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__mulsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__mulsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__mulsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__mulsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__mulsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__mulsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__mulsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__mulsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__mulsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__mulsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= test__mulsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__mulsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); + status |= test__mulsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__mulsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= test__mulsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= test__mulsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__mulsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__mulsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= 
test__mulsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__mulsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__mulsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__mulsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__mulsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__mulsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__mulsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000); + status |= test__mulsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__mulsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__mulsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__mulsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__mulsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__mulsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__mulsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= test__mulsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); + status |= test__mulsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); + status |= test__mulsf3(0xff800000, 0x7fde0397, 0x7fde0397); +#endif // ARM_NAN_HANDLING + + return status; +} diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py index a849990678d42..792e0be629fc4 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py @@ -335,6 +335,7 @@ def send_message(self, payload: dict) -> int: self._proc.stdin.flush() return self.seq + @staticmethod def _handle_message( message: dict, debugger_state: DAPDebuggerState, logger: Logger ): @@ -419,6 +420,7 @@ def _handle_message( request_seq = message["request_seq"] debugger_state.set_response(request_seq, message) + @staticmethod def _colorize_dap_message(message: dict) -> dict: colorized_message = copy.deepcopy(message) if colorized_message["type"] == "event": @@ -432,6 +434,7 @@ def _colorize_dap_message(message: dict) -> dict: colorized_message["command"] = f"{colorized_message['command']}" return colorized_message + @staticmethod def _read_dap_output( proc: subprocess.Popen, debugger_state: DAPDebuggerState, @@ -454,6 +457,7 @@ def _read_dap_output( DAP._handle_message(message, debugger_state, logger) buffer = rest[content_length:] + @staticmethod def _read_dap_err(proc: subprocess.Popen, logger: Logger): while True: err: bytes = proc.stderr.readline() @@ -930,10 +934,16 @@ def evaluate_expression(self, expression, frame_idx=0) -> ValueIR: ) ) eval_response = self._await_response(eval_req_id) + result: str = "" if not eval_response["success"]: - result: str = eval_response["message"] + if eval_response["body"].get("error", None): + result = eval_response["body"]["error"]["format"] + elif eval_response["message"]: + result = eval_response["message"] + else: + result = "" else: - result: str = eval_response["body"]["result"] + result = eval_response["body"]["result"] type_str = eval_response["body"].get("type") return self._evaluate_result_value(expression, result, type_str) diff --git a/cross-project-tests/dtlto/dtlto-cache.test b/cross-project-tests/dtlto/dtlto-cache.test index b98d4dbb433bb..5dd67a50ab2c3 100644 --- a/cross-project-tests/dtlto/dtlto-cache.test +++ b/cross-project-tests/dtlto/dtlto-cache.test @@ -17,7 +17,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # 
Check that there are two backend compilation jobs occurred. -RUN: grep -wo args populate1.*.dist-file.json | wc -l | grep -qx 3 +RUN: grep -wo args populate1.*.dist-file.json | wc -l | grep -qx "\s*3" RUN: ls cache.dir/llvmcache.timestamp RUN: ls cache.dir | count 3 @@ -32,7 +32,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # Check that there are no backend compilation jobs occurred. -RUN: grep -wo args populate2.*.dist-file.json | wc -l | grep -qx 1 +RUN: grep -wo args populate2.*.dist-file.json | wc -l | grep -qx "\s*1" RUN: ls cache.dir | count 3 RUN: %clang -O0 --target=x86_64-linux-gnu -flto=thin -c foo.c -o foo.O0.o @@ -52,7 +52,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # Check that there are two new backend compilation jobs occurred. -RUN: grep -wo args populate3.*.dist-file.json | wc -l | grep -qx 3 +RUN: grep -wo args populate3.*.dist-file.json | wc -l | grep -qx "\s*3" RUN: ls cache.dir | count 5 RUN: %clang -O2 --target=x86_64-linux-gnu -flto=thin -c main-partial.c @@ -69,7 +69,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # Check that there is one new backend compilation jobs occurred. -RUN: grep -wo args main-partial.*.dist-file.json | wc -l | grep -qx 2 +RUN: grep -wo args main-partial.*.dist-file.json | wc -l | grep -qx "\s*2" RUN: ls cache.dir | count 6 #--- foo.c diff --git a/cross-project-tests/dtlto/dtlto-thinlto-cache.test b/cross-project-tests/dtlto/dtlto-thinlto-cache.test index c177112e2dbbd..9b0ca228480d1 100644 --- a/cross-project-tests/dtlto/dtlto-thinlto-cache.test +++ b/cross-project-tests/dtlto/dtlto-thinlto-cache.test @@ -29,7 +29,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # Check that there are two backend compilation jobs occurred. -RUN: grep -wo args populate1.*.dist-file.json | wc -l | grep -qx 3 +RUN: grep -wo args populate1.*.dist-file.json | wc -l | grep -qx "\s*3" RUN: ls cache.dir | count 5 # Clean up cache directory. @@ -45,7 +45,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # Check that there are two backend compilation jobs occurred. 
-RUN: grep -wo args populate2.*.dist-file.json | wc -l | grep -qx 3 +RUN: grep -wo args populate2.*.dist-file.json | wc -l | grep -qx "\s*3" RUN: ls cache.dir/llvmcache.timestamp RUN: ls cache.dir | count 3 diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp index 8437a51471ca2..987586b97dfdc 100644 --- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp +++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp @@ -507,12 +507,12 @@ bool UnwindAssemblyInstEmulation::WriteRegister( case EmulateInstruction::eContextRelativeBranchImmediate: { if (context.GetInfoType() == EmulateInstruction::eInfoTypeISAAndImmediate && context.info.ISAAndImmediate.unsigned_data32 > 0) { - m_forward_branch_offset = - context.info.ISAAndImmediateSigned.signed_data32; + m_forward_branch_offset = context.info.ISAAndImmediate.unsigned_data32; } else if (context.GetInfoType() == EmulateInstruction::eInfoTypeISAAndImmediateSigned && context.info.ISAAndImmediateSigned.signed_data32 > 0) { - m_forward_branch_offset = context.info.ISAAndImmediate.unsigned_data32; + m_forward_branch_offset = + context.info.ISAAndImmediateSigned.signed_data32; } else if (context.GetInfoType() == EmulateInstruction::eInfoTypeImmediate && context.info.unsigned_immediate > 0) { diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 721a8f372d90b..0821b7219111b 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2178,7 +2178,8 @@ For example: This attribute specifies the possible memory effects of the call-site or function. It allows specifying the possible access kinds (``none``, ``read``, ``write``, or ``readwrite``) for the possible memory location - kinds (``argmem``, ``inaccessiblemem``, ``errnomem``, as well as a default). + kinds (``argmem``, ``inaccessiblemem``, ``errnomem``, ``target_mem0``, + ``target_mem1``, as well as a default). It is best understood by example: - ``memory(none)``: Does not access any memory. @@ -2220,6 +2221,11 @@ For example: accessing inaccessible memory itself). Inaccessible memory is often used to model control dependencies of intrinsics. - ``errnomem``: This refers to accesses to the ``errno`` variable. + - ``target_mem#`` : These refer to target specific state that cannot be + accessed by any other means. # is a number between 0 and 1 inclusive. + Note: The target_mem locations are experimental and intended for internal + testing only. They must not be used in production code. 
+ - The default access kind (specified without a location prefix) applies to all locations that haven't been specified explicitly, including those that don't currently have a dedicated location kind (e.g., accesses to globals diff --git a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp index 83c5899852d64..6e2aaf32325a9 100644 --- a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp +++ b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp @@ -9,8 +9,7 @@ #include "RemoteJITUtils.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" -#include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h" +#include "llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h" #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h" #include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h" #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h" diff --git a/llvm/include/llvm/Analysis/ScopedNoAliasAA.h b/llvm/include/llvm/Analysis/ScopedNoAliasAA.h index 942cc6f2a4b2b..dbe1afa50ee3a 100644 --- a/llvm/include/llvm/Analysis/ScopedNoAliasAA.h +++ b/llvm/include/llvm/Analysis/ScopedNoAliasAA.h @@ -46,12 +46,12 @@ class ScopedNoAliasAAResult : public AAResultBase { LLVM_ABI ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2, AAQueryInfo &AAQI); - LLVM_ABI void + LLVM_ABI static void collectScopedDomains(const MDNode *NoAlias, - SmallPtrSetImpl &Domains) const; + SmallPtrSetImpl &Domains); -private: - bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias) const; + LLVM_ABI static bool mayAliasInScopes(const MDNode *Scopes, + const MDNode *NoAlias); }; /// Analysis pass providing a never-invalidated alias analysis result. diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index bed9f10ce70ca..7fc9ce03e0a67 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -206,6 +206,8 @@ enum Kind { kw_readwrite, kw_argmem, kw_inaccessiblemem, + kw_target_mem0, + kw_target_mem1, kw_errnomem, // Legacy attributes: diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index cd466dceb900f..cfc8a4243e894 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1968,6 +1968,10 @@ LLVM_ABI bool isOnesOrOnesSplat(SDValue N, bool AllowUndefs = false); /// Build vector implicit truncation is allowed. LLVM_ABI bool isZeroOrZeroSplat(SDValue N, bool AllowUndefs = false); +/// Return true if the value is a constant (+/-)0.0 floating-point value or a +/// splatted vector thereof (with no undefs). +LLVM_ABI bool isZeroOrZeroSplatFP(SDValue N, bool AllowUndefs = false); + /// Return true if \p V is either a integer or FP constant. 
inline bool isIntOrFPConstant(SDValue V) { return isa(V) || isa(V); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h index 3ca3afa122836..1581f7aca211e 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h @@ -24,7 +24,7 @@ namespace orc { /// For each object containing debug info, installs JITLink passes to synthesize /// a debug object and then register it via the GDB JIT-registration interface. /// -/// Currently MachO only. For ELF use DebugObjectManagerPlugin. These two +/// Currently MachO only. For ELF use ELFDebugObjectPlugin. These two /// plugins will be merged in the near future. class LLVM_ABI GDBJITDebugInfoRegistrationPlugin : public ObjectLinkingLayer::Plugin { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h similarity index 87% rename from llvm/include/llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h rename to llvm/include/llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h index 1988403715f57..d946a029fd2ec 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h @@ -1,4 +1,4 @@ -//===---- DebugObjectManagerPlugin.h - JITLink debug objects ---*- C++ -*-===// +//===------ ELFDebugObjectPlugin.h - JITLink debug objects ------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_EXECUTIONENGINE_ORC_DEBUGOBJECTMANAGERPLUGIN_H -#define LLVM_EXECUTIONENGINE_ORC_DEBUGOBJECTMANAGERPLUGIN_H +#ifndef LLVM_EXECUTIONENGINE_ORC_ELFDEBUGOBJECTPLUGIN_H +#define LLVM_EXECUTIONENGINE_ORC_ELFDEBUGOBJECTPLUGIN_H #include "llvm/ExecutionEngine/JITLink/JITLink.h" #include "llvm/ExecutionEngine/Orc/Core.h" @@ -46,7 +46,7 @@ class DebugObject; /// DebugObjectRegistrar is notified. Ownership of DebugObjects remains with the /// plugin. /// -class LLVM_ABI DebugObjectManagerPlugin : public ObjectLinkingLayer::Plugin { +class LLVM_ABI ELFDebugObjectPlugin : public ObjectLinkingLayer::Plugin { public: /// Create the plugin to submit DebugObjects for JITLink artifacts. For all /// options the recommended setting is true. @@ -63,9 +63,9 @@ class LLVM_ABI DebugObjectManagerPlugin : public ObjectLinkingLayer::Plugin { /// sequence. When turning this off, the user has to issue the call to /// __jit_debug_register_code() on the executor side manually. 
/// - DebugObjectManagerPlugin(ExecutionSession &ES, bool RequireDebugSections, - bool AutoRegisterCode, Error &Err); - ~DebugObjectManagerPlugin() override; + ELFDebugObjectPlugin(ExecutionSession &ES, bool RequireDebugSections, + bool AutoRegisterCode, Error &Err); + ~ELFDebugObjectPlugin() override; void notifyMaterializing(MaterializationResponsibility &MR, jitlink::LinkGraph &G, jitlink::JITLinkContext &Ctx, @@ -99,4 +99,4 @@ class LLVM_ABI DebugObjectManagerPlugin : public ObjectLinkingLayer::Plugin { } // namespace orc } // namespace llvm -#endif // LLVM_EXECUTIONENGINE_ORC_DEBUGOBJECTMANAGERPLUGIN_H +#endif // LLVM_EXECUTIONENGINE_ORC_ELFDEBUGOBJECTPLUGIN_H diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index adec819432534..55541416d9c21 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -54,6 +54,25 @@ def IntrInaccessibleMemOnly : IntrinsicProperty; // by the module being compiled. This is a weaker form of IntrArgMemOnly. def IntrInaccessibleMemOrArgMemOnly : IntrinsicProperty; +// Tablegen representation of IRMemLocation. +class IntrinsicMemoryLocation; + +// TODO: Populate with all IRMemLocation enum values and update +// getValueAsIRMemLocation accordingly. +def InaccessibleMem : IntrinsicMemoryLocation; +def TargetMem0 : IntrinsicMemoryLocation; +def TargetMem1 : IntrinsicMemoryLocation; + +// The list of IRMemoryLocations that are read from. +class IntrRead idx> : IntrinsicProperty { + list MemLoc=idx; +} + +// The list of IRMemoryLocations that are write to. +class IntrWrite idx> : IntrinsicProperty { + list MemLoc=idx; +} + // Commutative - This intrinsic is commutative: X op Y == Y op X. def Commutative : IntrinsicProperty; diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index c84c158c57b8e..77fdb8295faa8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -126,8 +126,8 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; class AdvSIMD_1FloatArg_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Intrinsic - : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Intrinsic Attrs = []> + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_1VectorArg_Expand_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; class AdvSIMD_1IntArg_Narrow_Intrinsic @@ -145,9 +145,9 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". class AdvSIMD_2FloatArg_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_2VectorArg_Intrinsic + class AdvSIMD_2VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_2Arg_FloatCompare_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>], [IntrNoMem]>; @@ -175,15 +175,14 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
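// Note on the mechanical changes throughout this file: each intrinsic class
// gains an optional Attrs parameter (a list of extra IntrinsicProperty values,
// defaulting to the empty list) and merges it into its existing property list
// via !listconcat(Attrs, [IntrNoMem, ...]). Individual defs can then append
// properties such as IntrSpeculatable without duplicating the base attribute
// lists.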
: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - class AdvSIMD_3IntArg_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_3VectorArg_Intrinsic + class AdvSIMD_3VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_3VectorArg_Scalar_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], @@ -1095,124 +1094,124 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_ptr_ty], [IntrArgMemOnly, NoCapture>]>; - class AdvSIMD_SVE_Index_Intrinsic + class AdvSIMD_SVE_Index_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMVectorElementType<0>, LLVMVectorElementType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_Merged1VectorArg_Intrinsic + class AdvSIMD_Merged1VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_2VectorArgIndexed_Intrinsic + class AdvSIMD_2VectorArgIndexed_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class AdvSIMD_3VectorArgIndexed_Intrinsic + class AdvSIMD_3VectorArgIndexed_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class AdvSIMD_Pred1VectorArg_Intrinsic + class AdvSIMD_Pred1VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_Pred2VectorArg_Intrinsic + class AdvSIMD_Pred2VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_Pred3VectorArg_Intrinsic + class AdvSIMD_Pred3VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_Compare_Intrinsic + class AdvSIMD_SVE_Compare_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_CompareWide_Intrinsic + class AdvSIMD_SVE_CompareWide_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyvector_ty, llvm_nxv2i64_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_Saturating_Intrinsic + class AdvSIMD_SVE_Saturating_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - 
class AdvSIMD_SVE_SaturatingWithPattern_Intrinsic + class AdvSIMD_SVE_SaturatingWithPattern_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>, ImmArg>])>; - class AdvSIMD_SVE_Saturating_N_Intrinsic + class AdvSIMD_SVE_Saturating_N_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[T], [T, llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic + class AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[T], [T, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>, ImmArg>])>; - class AdvSIMD_SVE_CNT_Intrinsic + class AdvSIMD_SVE_CNT_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>], [LLVMVectorOfBitcastsToInt<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_ReduceWithInit_Intrinsic + class AdvSIMD_SVE_ReduceWithInit_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMVectorElementType<0>, llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_ShiftByImm_Intrinsic + class AdvSIMD_SVE_ShiftByImm_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class AdvSIMD_SVE_ShiftWide_Intrinsic + class AdvSIMD_SVE_ShiftWide_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>, llvm_nxv2i64_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_Unpack_Intrinsic + class AdvSIMD_SVE_Unpack_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_SVE_CADD_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -1231,31 +1230,31 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". llvm_i32_ty], [IntrNoMem, ImmArg>]>; - class AdvSIMD_SVE_CMLA_LANE_Intrinsic + class AdvSIMD_SVE_CMLA_LANE_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>, ImmArg>])>; - class AdvSIMD_SVE_DUP_Intrinsic + class AdvSIMD_SVE_DUP_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMVectorElementType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_DUP_Unpred_Intrinsic + class AdvSIMD_SVE_DUP_Unpred_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMVectorElementType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_DUPQ_Intrinsic + class AdvSIMD_SVE_DUPQ_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i64_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_SVE_EXPA_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -1276,21 +1275,21 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
llvm_anyvector_ty], [IntrNoMem]>; - class AdvSIMD_SVE_INSR_Intrinsic + class AdvSIMD_SVE_INSR_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMVectorElementType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_SVE_PTRUE_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty], [IntrNoMem, ImmArg>]>; - class AdvSIMD_SVE_PUNPKHI_Intrinsic + class AdvSIMD_SVE_PUNPKHI_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMOneNthElementsVectorType<0, 2>], [llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_SVE_SCALE_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -1312,191 +1311,192 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; - class AdvSIMD_SVE_CNTB_Intrinsic + class AdvSIMD_SVE_CNTB_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class AdvSIMD_SVE_CNTP_Intrinsic + class AdvSIMD_SVE_CNTP_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_DOT_Intrinsic + class AdvSIMD_SVE_DOT_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMSubdivide4VectorType<0>, LLVMSubdivide4VectorType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_DOT_Indexed_Intrinsic + class AdvSIMD_SVE_DOT_Indexed_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMSubdivide4VectorType<0>, LLVMSubdivide4VectorType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class AdvSIMD_SVE_PTEST_Intrinsic + class AdvSIMD_SVE_PTEST_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_TBL_Intrinsic + class AdvSIMD_SVE_TBL_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE2_TBX_Intrinsic + class AdvSIMD_SVE2_TBX_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class SVE2_LUTI_Inrinsic + class SVE2_LUTI_Inrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class SVE2_1VectorArg_Long_Intrinsic + class SVE2_1VectorArg_Long_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class SVE2_2VectorArg_Long_Intrinsic + class SVE2_2VectorArg_Long_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class SVE2_2VectorArgIndexed_Long_Intrinsic + class SVE2_2VectorArgIndexed_Long_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class SVE2_2VectorArg_Wide_Intrinsic + 
class SVE2_2VectorArg_Wide_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMSubdivide2VectorType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class SVE2_2VectorArg_Pred_Long_Intrinsic + class SVE2_2VectorArg_Pred_Long_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>, LLVMSubdivide2VectorType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class SVE2_3VectorArg_Long_Intrinsic + class SVE2_3VectorArg_Long_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class SVE2_3VectorArgIndexed_Long_Intrinsic + class SVE2_3VectorArgIndexed_Long_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class SVE2_1VectorArg_Narrowing_Intrinsic + class SVE2_1VectorArg_Narrowing_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>], [llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class SVE2_Merged1VectorArg_Narrowing_Intrinsic + class SVE2_Merged1VectorArg_Narrowing_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>], [LLVMSubdivide2VectorType<0>, llvm_anyvector_ty], - [IntrNoMem]>; - class SVE2_2VectorArg_Narrowing_Intrinsic + !listconcat(Attrs, [IntrNoMem])>; + + class SVE2_2VectorArg_Narrowing_Intrinsic Attrs = []> : DefaultAttrsIntrinsic< [LLVMSubdivide2VectorType<0>], [llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class SVE2_Merged2VectorArg_Narrowing_Intrinsic + class SVE2_Merged2VectorArg_Narrowing_Intrinsic Attrs = []> : DefaultAttrsIntrinsic< [LLVMSubdivide2VectorType<0>], [LLVMSubdivide2VectorType<0>, llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class SVE2_1VectorArg_Imm_Narrowing_Intrinsic + class SVE2_1VectorArg_Imm_Narrowing_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>], [llvm_anyvector_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class SVE2_2VectorArg_Imm_Narrowing_Intrinsic + class SVE2_2VectorArg_Imm_Narrowing_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>], [LLVMSubdivide2VectorType<0>, llvm_anyvector_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class SVE2_CONFLICT_DETECT_Intrinsic + class SVE2_CONFLICT_DETECT_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyptr_ty, LLVMMatchType<1>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class SVE2_3VectorArg_Indexed_Intrinsic + class SVE2_3VectorArg_Indexed_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class SVE2_1VectorArgIndexed_Intrinsic + class SVE2_1VectorArgIndexed_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class AdvSIMD_SVE_CDOT_LANE_Intrinsic + class AdvSIMD_SVE_CDOT_LANE_Intrinsic Attrs = []> : 
DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMSubdivide4VectorType<0>, LLVMSubdivide4VectorType<0>, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>, ImmArg>])>; - class SVE2_1VectorArg_Pred_Intrinsic + class SVE2_1VectorArg_Pred_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class SVE2_1VectorArgIndexed_Pred_Intrinsic + class SVE2_1VectorArgIndexed_Pred_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyvector_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class SVE2_Pred_1VectorArgIndexed_Intrinsic + class SVE2_Pred_1VectorArgIndexed_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class SVE2_Pred_1VectorArg_Intrinsic + class SVE2_Pred_1VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; // NOTE: There is no relationship between these intrinsics beyond an attempt // to reuse currently identical class definitions. - class AdvSIMD_SVE_LOGB_Intrinsic : AdvSIMD_SVE_CNT_Intrinsic; - class AdvSIMD_SVE2_CADD_Intrinsic : AdvSIMD_2VectorArgIndexed_Intrinsic; - class AdvSIMD_SVE2_CMLA_Intrinsic : AdvSIMD_3VectorArgIndexed_Intrinsic; + class AdvSIMD_SVE_LOGB_Intrinsic : AdvSIMD_SVE_CNT_Intrinsic; + class AdvSIMD_SVE2_CADD_Intrinsic Attrs = []> : AdvSIMD_2VectorArgIndexed_Intrinsic; + class AdvSIMD_SVE2_CMLA_Intrinsic Attrs = []> : AdvSIMD_3VectorArgIndexed_Intrinsic; // This class of intrinsics are not intended to be useful within LLVM IR but // are instead here to support some of the more regid parts of the ACLE. @@ -1509,39 +1509,39 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
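// The classes in this block are threaded through the same optional Attrs
// hook. The SVE defs further down use it to attach IntrSpeculatable, i.e.
// these intrinsics are declared free of side effects and (per the LangRef
// description of `speculatable`) safe to execute speculatively.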
-class AdvSIMD_SVE_2SVBoolArg_Intrinsic +class AdvSIMD_SVE_2SVBoolArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], [llvm_nxv16i1_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; -class AdvSIMD_SVE_3SVBoolArg_Intrinsic +class AdvSIMD_SVE_3SVBoolArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], [llvm_nxv16i1_ty, llvm_nxv16i1_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; -class AdvSIMD_SVE_Reduce_Intrinsic +class AdvSIMD_SVE_Reduce_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; -class AdvSIMD_SVE_V128_Reduce_Intrinsic +class AdvSIMD_SVE_V128_Reduce_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<1, llvm_i1_ty>, llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; -class AdvSIMD_SVE_SADDV_Reduce_Intrinsic +class AdvSIMD_SVE_SADDV_Reduce_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_i64_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; -class AdvSIMD_SVE_WHILE_Intrinsic +class AdvSIMD_SVE_WHILE_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -1684,10 +1684,10 @@ class SVE_gather_prf_VS ], [IntrInaccessibleMemOrArgMemOnly, ImmArg>]>; -class SVE_MatMul_Intrinsic +class SVE_MatMul_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMSubdivide4VectorType<0>, LLVMSubdivide4VectorType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class SVE_4Vec_BF16 : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty], @@ -1765,159 +1765,158 @@ def int_aarch64_sve_prfd_gather_scalar_offset : SVE_gather_prf_VS; // Scalar to vector operations // -def int_aarch64_sve_dup : AdvSIMD_SVE_DUP_Intrinsic; -def int_aarch64_sve_dup_x : AdvSIMD_SVE_DUP_Unpred_Intrinsic; +def int_aarch64_sve_dup : AdvSIMD_SVE_DUP_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_dup_x : AdvSIMD_SVE_DUP_Unpred_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_index : AdvSIMD_SVE_Index_Intrinsic; +def int_aarch64_sve_index : AdvSIMD_SVE_Index_Intrinsic<[IntrSpeculatable]>; // // Address calculation // -def int_aarch64_sve_adrb : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_adrh : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_adrw : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_adrd : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_adrb : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_adrh : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_adrw : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_adrd : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; // // Integer arithmetic // -def int_aarch64_sve_add : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_add_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sub : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sub_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_subr : AdvSIMD_Pred2VectorArg_Intrinsic; - -def int_aarch64_sve_pmul : AdvSIMD_2VectorArg_Intrinsic; - -def int_aarch64_sve_mul : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_mul_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def 
int_aarch64_sve_mul_lane : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_smulh : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_smulh_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_umulh : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_umulh_u : AdvSIMD_Pred2VectorArg_Intrinsic; - -def int_aarch64_sve_sdiv : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sdiv_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_udiv : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_udiv_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sdivr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_udivr : AdvSIMD_Pred2VectorArg_Intrinsic; - -def int_aarch64_sve_smax : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_smax_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_umax : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_umax_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_smin : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_smin_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_umin : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_umin_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sabd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sabd_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uabd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uabd_u : AdvSIMD_Pred2VectorArg_Intrinsic; - -def int_aarch64_sve_mad : AdvSIMD_Pred3VectorArg_Intrinsic; -def int_aarch64_sve_msb : AdvSIMD_Pred3VectorArg_Intrinsic; -def int_aarch64_sve_mla : AdvSIMD_Pred3VectorArg_Intrinsic; -def int_aarch64_sve_mla_u : AdvSIMD_Pred3VectorArg_Intrinsic; -def int_aarch64_sve_mla_lane : AdvSIMD_3VectorArgIndexed_Intrinsic; -def int_aarch64_sve_mls : AdvSIMD_Pred3VectorArg_Intrinsic; -def int_aarch64_sve_mls_u : AdvSIMD_Pred3VectorArg_Intrinsic; -def int_aarch64_sve_mls_lane : AdvSIMD_3VectorArgIndexed_Intrinsic; - -def int_aarch64_sve_saddv : AdvSIMD_SVE_SADDV_Reduce_Intrinsic; -def int_aarch64_sve_uaddv : AdvSIMD_SVE_SADDV_Reduce_Intrinsic; - -def int_aarch64_sve_smaxv : AdvSIMD_SVE_Reduce_Intrinsic; -def int_aarch64_sve_umaxv : AdvSIMD_SVE_Reduce_Intrinsic; -def int_aarch64_sve_sminv : AdvSIMD_SVE_Reduce_Intrinsic; -def int_aarch64_sve_uminv : AdvSIMD_SVE_Reduce_Intrinsic; - -def int_aarch64_sve_orv : AdvSIMD_SVE_Reduce_Intrinsic; -def int_aarch64_sve_eorv : AdvSIMD_SVE_Reduce_Intrinsic; -def int_aarch64_sve_andv : AdvSIMD_SVE_Reduce_Intrinsic; - -def int_aarch64_sve_abs : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_neg : AdvSIMD_Merged1VectorArg_Intrinsic; - -def int_aarch64_sve_sdot : AdvSIMD_SVE_DOT_Intrinsic; -def int_aarch64_sve_sdot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic; - -def int_aarch64_sve_udot : AdvSIMD_SVE_DOT_Intrinsic; -def int_aarch64_sve_udot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic; - -def int_aarch64_sve_sqadd_x : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_sqsub_x : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uqadd_x : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uqsub_x : AdvSIMD_2VectorArg_Intrinsic; - -def int_aarch64_sve_orqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; -def int_aarch64_sve_eorqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; -def int_aarch64_sve_andqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; -def int_aarch64_sve_addqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; -def int_aarch64_sve_smaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; -def int_aarch64_sve_umaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; -def int_aarch64_sve_sminqv : 
AdvSIMD_SVE_V128_Reduce_Intrinsic; -def int_aarch64_sve_uminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic; - +def int_aarch64_sve_add : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_add_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sub : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sub_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_subr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_pmul : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_mul : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_mul_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_mul_lane : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smulh : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smulh_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umulh : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umulh_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_sdiv : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sdiv_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_udiv : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_udiv_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sdivr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_udivr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_smax : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smax_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umax : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umax_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smin : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smin_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umin : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umin_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sabd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sabd_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uabd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uabd_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_mad : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_msb : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_mla : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_mla_u : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_mla_lane : AdvSIMD_3VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_mls : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_mls_u : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_mls_lane : AdvSIMD_3VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_saddv : AdvSIMD_SVE_SADDV_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uaddv : AdvSIMD_SVE_SADDV_Reduce_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_smaxv : 
AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umaxv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sminv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uminv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_orv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_eorv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_andv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_abs : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_neg : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_sdot : AdvSIMD_SVE_DOT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sdot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_udot : AdvSIMD_SVE_DOT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_udot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_sqadd_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqsub_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqadd_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqsub_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_orqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_eorqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_andqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_addqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; // Shifts -def int_aarch64_sve_asr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_asr_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_asr_wide : AdvSIMD_SVE_ShiftWide_Intrinsic; -def int_aarch64_sve_asrd : AdvSIMD_SVE_ShiftByImm_Intrinsic; -def int_aarch64_sve_insr : AdvSIMD_SVE_INSR_Intrinsic; -def int_aarch64_sve_lsl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_lsl_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_lsl_wide : AdvSIMD_SVE_ShiftWide_Intrinsic; -def int_aarch64_sve_lsr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_lsr_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_lsr_wide : AdvSIMD_SVE_ShiftWide_Intrinsic; +def int_aarch64_sve_asr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_asr_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_asr_wide : AdvSIMD_SVE_ShiftWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_asrd : AdvSIMD_SVE_ShiftByImm_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_insr : AdvSIMD_SVE_INSR_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lsl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lsl_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lsl_wide : AdvSIMD_SVE_ShiftWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lsr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lsr_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def 
int_aarch64_sve_lsr_wide : AdvSIMD_SVE_ShiftWide_Intrinsic<[IntrSpeculatable]>; // // Integer comparisons // -def int_aarch64_sve_cmpeq : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_cmpge : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_cmpgt : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_cmphi : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_cmphs : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_cmpne : AdvSIMD_SVE_Compare_Intrinsic; +def int_aarch64_sve_cmpeq : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpge : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpgt : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmphi : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmphs : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpne : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_cmpeq_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmpge_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmpgt_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmphi_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmphs_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmple_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmplo_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmpls_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmplt_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmpne_wide : AdvSIMD_SVE_CompareWide_Intrinsic; +def int_aarch64_sve_cmpeq_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpge_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpgt_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmphi_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmphs_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmple_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmplo_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpls_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmplt_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpne_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; // // Counting bits // -def int_aarch64_sve_cls : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_clz : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_cnt : AdvSIMD_SVE_CNT_Intrinsic; +def int_aarch64_sve_cls : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_clz : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cnt : AdvSIMD_SVE_CNT_Intrinsic<[IntrSpeculatable]>; // // Counting elements // -def int_aarch64_sve_cntb : AdvSIMD_SVE_CNTB_Intrinsic; -def int_aarch64_sve_cnth : AdvSIMD_SVE_CNTB_Intrinsic; -def int_aarch64_sve_cntw : AdvSIMD_SVE_CNTB_Intrinsic; -def int_aarch64_sve_cntd : AdvSIMD_SVE_CNTB_Intrinsic; +def int_aarch64_sve_cntb : AdvSIMD_SVE_CNTB_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cnth : AdvSIMD_SVE_CNTB_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cntw : AdvSIMD_SVE_CNTB_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cntd : AdvSIMD_SVE_CNTB_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_cntp : 
AdvSIMD_SVE_CNTP_Intrinsic; +def int_aarch64_sve_cntp : AdvSIMD_SVE_CNTP_Intrinsic<[IntrSpeculatable]>; // // FFR manipulation @@ -1932,173 +1931,173 @@ def int_aarch64_sve_wrffr : ClangBuiltin<"__builtin_sve_svwrffr">, DefaultAt // Saturating scalar arithmetic // -def int_aarch64_sve_sqdech : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqdecw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqdecd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqdecp : AdvSIMD_SVE_Saturating_Intrinsic; - -def int_aarch64_sve_sqdecb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdech_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdech_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; -def int_aarch64_sve_sqdecp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; - -def int_aarch64_sve_sqinch : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqincw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqincd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqincp : AdvSIMD_SVE_Saturating_Intrinsic; - -def int_aarch64_sve_sqincb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqinch_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqinch_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; -def int_aarch64_sve_sqincp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; - -def int_aarch64_sve_uqdech : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqdecw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqdecd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqdecp : AdvSIMD_SVE_Saturating_Intrinsic; - -def int_aarch64_sve_uqdecb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdech_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdech_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; -def int_aarch64_sve_uqdecp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; - -def int_aarch64_sve_uqinch : 
AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqincw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqincd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqincp : AdvSIMD_SVE_Saturating_Intrinsic; - -def int_aarch64_sve_uqincb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqinch_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqinch_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; -def int_aarch64_sve_uqincp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; +def int_aarch64_sve_sqdech : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdecw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdecd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdecp : AdvSIMD_SVE_Saturating_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_sqdecb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdech_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdech_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; +def int_aarch64_sve_sqdecp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; + +def int_aarch64_sve_sqinch : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqincw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqincd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqincp : AdvSIMD_SVE_Saturating_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_sqincb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqinch_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqinch_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; +def int_aarch64_sve_sqincp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; + +def int_aarch64_sve_uqdech : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqdecw : 
AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqdecd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqdecp : AdvSIMD_SVE_Saturating_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_uqdecb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdech_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdech_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; +def int_aarch64_sve_uqdecp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; + +def int_aarch64_sve_uqinch : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqincw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqincd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqincp : AdvSIMD_SVE_Saturating_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_uqincb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqinch_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqinch_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; +def int_aarch64_sve_uqincp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; // // Reversal // -def int_aarch64_sve_rbit : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_revb : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_revh : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_revw : AdvSIMD_Merged1VectorArg_Intrinsic; +def int_aarch64_sve_rbit : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_revb : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_revh : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_revw : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; // // Permutations and selection // -def int_aarch64_sve_clasta : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_clasta_n : AdvSIMD_SVE_ReduceWithInit_Intrinsic; -def int_aarch64_sve_clastb : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_clastb_n : AdvSIMD_SVE_ReduceWithInit_Intrinsic; -def int_aarch64_sve_compact : AdvSIMD_Pred1VectorArg_Intrinsic; -def int_aarch64_sve_dupq_lane : AdvSIMD_SVE_DUPQ_Intrinsic; -def int_aarch64_sve_dup_laneq : SVE2_1VectorArgIndexed_Intrinsic; -def int_aarch64_sve_ext : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sel : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_lasta : AdvSIMD_SVE_Reduce_Intrinsic; -def int_aarch64_sve_lastb : AdvSIMD_SVE_Reduce_Intrinsic; -def 
int_aarch64_sve_rev : AdvSIMD_1VectorArg_Intrinsic; -def int_aarch64_sve_rev_b16 : AdvSIMD_SVE_2SVBoolArg_Intrinsic; -def int_aarch64_sve_rev_b32 : AdvSIMD_SVE_2SVBoolArg_Intrinsic; -def int_aarch64_sve_rev_b64 : AdvSIMD_SVE_2SVBoolArg_Intrinsic; -def int_aarch64_sve_splice : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sunpkhi : AdvSIMD_SVE_Unpack_Intrinsic; -def int_aarch64_sve_sunpklo : AdvSIMD_SVE_Unpack_Intrinsic; -def int_aarch64_sve_tbl : AdvSIMD_SVE_TBL_Intrinsic; -def int_aarch64_sve_trn1 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_trn1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn2 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_trn2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn1q : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_trn2q : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uunpkhi : AdvSIMD_SVE_Unpack_Intrinsic; -def int_aarch64_sve_uunpklo : AdvSIMD_SVE_Unpack_Intrinsic; -def int_aarch64_sve_uzp1 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uzp1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp2 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uzp2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp1q : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uzp2q : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_zip1 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_zip1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip2 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_zip2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip1q : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_zip2q : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_clasta : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_clasta_n : AdvSIMD_SVE_ReduceWithInit_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_clastb : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_clastb_n : AdvSIMD_SVE_ReduceWithInit_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_compact : AdvSIMD_Pred1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_dupq_lane : AdvSIMD_SVE_DUPQ_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_dup_laneq : SVE2_1VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ext : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sel : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lasta : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lastb : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_rev : AdvSIMD_1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_rev_b16 : 
AdvSIMD_SVE_2SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_rev_b32 : AdvSIMD_SVE_2SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_rev_b64 : AdvSIMD_SVE_2SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_splice : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sunpkhi : AdvSIMD_SVE_Unpack_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sunpklo : AdvSIMD_SVE_Unpack_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_tbl : AdvSIMD_SVE_TBL_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn1 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn2 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn1q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn2q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uunpkhi : AdvSIMD_SVE_Unpack_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uunpklo : AdvSIMD_SVE_Unpack_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp1 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp2 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp1q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp2q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip1 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip2 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip1q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip2q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; // // Logical operations // -def int_aarch64_sve_and : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_and_u: AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_bic : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_bic_u: AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_cnot : AdvSIMD_Merged1VectorArg_Intrinsic; -def 
int_aarch64_sve_eor : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_eor_u: AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_not : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_orr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_orr_u: AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_and : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_and_u: AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bic : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bic_u: AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cnot : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_eor : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_eor_u: AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_not : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_orr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_orr_u: AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; // // Conversion // -def int_aarch64_sve_sxtb : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_sxth : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_sxtw : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_uxtb : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_uxth : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_uxtw : AdvSIMD_Merged1VectorArg_Intrinsic; +def int_aarch64_sve_sxtb : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sxth : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sxtw : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uxtb : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uxth : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uxtw : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; // // While comparisons // -def int_aarch64_sve_whilele : AdvSIMD_SVE_WHILE_Intrinsic; -def int_aarch64_sve_whilelo : AdvSIMD_SVE_WHILE_Intrinsic; -def int_aarch64_sve_whilels : AdvSIMD_SVE_WHILE_Intrinsic; -def int_aarch64_sve_whilelt : AdvSIMD_SVE_WHILE_Intrinsic; -def int_aarch64_sve_whilege : AdvSIMD_SVE_WHILE_Intrinsic; -def int_aarch64_sve_whilegt : AdvSIMD_SVE_WHILE_Intrinsic; -def int_aarch64_sve_whilehs : AdvSIMD_SVE_WHILE_Intrinsic; -def int_aarch64_sve_whilehi : AdvSIMD_SVE_WHILE_Intrinsic; +def int_aarch64_sve_whilele : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilelo : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilels : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilelt : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilege : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilegt : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilehs : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilehi : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>; // // Floating-point arithmetic @@ -2254,32 +2253,32 @@ def int_aarch64_sve_ptrue : AdvSIMD_SVE_PTRUE_Intrinsic; // Predicate operations // -def int_aarch64_sve_and_z : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_bic_z : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_brka : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_brka_z : 
AdvSIMD_Pred1VectorArg_Intrinsic; -def int_aarch64_sve_brkb : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_brkb_z : AdvSIMD_Pred1VectorArg_Intrinsic; -def int_aarch64_sve_brkn_z : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_brkpa_z : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_brkpb_z : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_eor_z : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_nand_z : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_nor_z : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_orn_z : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_orr_z : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_pfirst : AdvSIMD_Pred1VectorArg_Intrinsic; -def int_aarch64_sve_pnext : AdvSIMD_Pred1VectorArg_Intrinsic; -def int_aarch64_sve_punpkhi : AdvSIMD_SVE_PUNPKHI_Intrinsic; -def int_aarch64_sve_punpklo : AdvSIMD_SVE_PUNPKHI_Intrinsic; +def int_aarch64_sve_and_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bic_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_brka : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_brka_z : AdvSIMD_Pred1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_brkb : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_brkb_z : AdvSIMD_Pred1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_brkn_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_brkpa_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_brkpb_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_eor_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_nand_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_nor_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_orn_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_orr_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_pfirst : AdvSIMD_Pred1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_pnext : AdvSIMD_Pred1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_punpkhi : AdvSIMD_SVE_PUNPKHI_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_punpklo : AdvSIMD_SVE_PUNPKHI_Intrinsic<[IntrSpeculatable]>; // // Testing predicates // -def int_aarch64_sve_ptest_any : AdvSIMD_SVE_PTEST_Intrinsic; -def int_aarch64_sve_ptest_first : AdvSIMD_SVE_PTEST_Intrinsic; -def int_aarch64_sve_ptest_last : AdvSIMD_SVE_PTEST_Intrinsic; +def int_aarch64_sve_ptest_any : AdvSIMD_SVE_PTEST_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ptest_first : AdvSIMD_SVE_PTEST_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ptest_last : AdvSIMD_SVE_PTEST_Intrinsic<[IntrSpeculatable]>; // // Reinterpreting data @@ -2287,11 +2286,11 @@ def int_aarch64_sve_ptest_last : AdvSIMD_SVE_PTEST_Intrinsic; def int_aarch64_sve_convert_from_svbool : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_nxv16i1_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_convert_to_svbool : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty], [llvm_any_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; // // Gather loads: scalar base + vector offsets @@ -2434,134 +2433,134 @@ def int_aarch64_sve_stnt1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intri // SVE2 - Uniform DSP operations // -def int_aarch64_sve_saba : AdvSIMD_3VectorArg_Intrinsic; -def 
int_aarch64_sve_shadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_shsub : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_shsubr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sli : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sqabs : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_sqadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sqdmulh : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_sqdmulh_lane : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sqneg : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_sqrdmlah : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_sqrdmlah_lane : AdvSIMD_3VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sqrdmlsh : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_sqrdmlsh_lane : AdvSIMD_3VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sqrdmulh : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_sqrdmulh_lane : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sqrshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sqshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sqshlu : AdvSIMD_SVE_ShiftByImm_Intrinsic; -def int_aarch64_sve_sqsub : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sqsub_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sqsubr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_srhadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sri : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_srshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_srshr : AdvSIMD_SVE_ShiftByImm_Intrinsic; -def int_aarch64_sve_srsra : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_ssra : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_suqadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uaba : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_uhadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uhsub : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uhsubr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqrshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqsub : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqsub_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqsubr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_urecpe : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_urhadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_urshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_urshr : AdvSIMD_SVE_ShiftByImm_Intrinsic; -def int_aarch64_sve_ursqrte : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_ursra : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_usqadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_usra : AdvSIMD_2VectorArgIndexed_Intrinsic; +def int_aarch64_sve_saba : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_shadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_shsub : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_shsubr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sli : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqabs : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def 
int_aarch64_sve_sqdmulh : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmulh_lane : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqneg : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmlah : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmlah_lane : AdvSIMD_3VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmlsh : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmlsh_lane : AdvSIMD_3VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmulh : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmulh_lane : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqshlu : AdvSIMD_SVE_ShiftByImm_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqsub : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqsub_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqsubr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_srhadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sri : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_srshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_srshr : AdvSIMD_SVE_ShiftByImm_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_srsra : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssra : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_suqadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uaba : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uhadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uhsub : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uhsubr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqrshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqsub : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqsub_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqsubr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_urecpe : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_urhadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_urshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_urshr : AdvSIMD_SVE_ShiftByImm_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ursqrte : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ursra : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_usqadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_usra : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Widening DSP operations // -def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic; -def 
int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_saddwb : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_saddwt : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_sshllb : SVE2_1VectorArg_Long_Intrinsic; -def int_aarch64_sve_sshllt : SVE2_1VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssubwb : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_ssubwt : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uaddwb : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_uaddwt : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_ushllb : SVE2_1VectorArg_Long_Intrinsic; -def int_aarch64_sve_ushllt : SVE2_1VectorArg_Long_Intrinsic; -def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_usubwb : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_usubwt : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_saddwb : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_saddwt : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sshllb : SVE2_1VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sshllt : SVE2_1VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssubwb : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssubwt : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uaddwb : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uaddwt : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ushllb : SVE2_1VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ushllt : 
SVE2_1VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_usubwb : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_usubwt : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Non-widening pairwise arithmetic // -def int_aarch64_sve_addp : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_addp : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; def int_aarch64_sve_faddp : AdvSIMD_Pred2VectorArg_Intrinsic; def int_aarch64_sve_fmaxp : AdvSIMD_Pred2VectorArg_Intrinsic; def int_aarch64_sve_fmaxnmp : AdvSIMD_Pred2VectorArg_Intrinsic; def int_aarch64_sve_fminp : AdvSIMD_Pred2VectorArg_Intrinsic; def int_aarch64_sve_fminnmp : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_smaxp : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sminp : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_umaxp : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uminp : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_smaxp : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sminp : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umaxp : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uminp : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Widening pairwise arithmetic // -def int_aarch64_sve_sadalp : SVE2_2VectorArg_Pred_Long_Intrinsic; -def int_aarch64_sve_uadalp : SVE2_2VectorArg_Pred_Long_Intrinsic; +def int_aarch64_sve_sadalp : SVE2_2VectorArg_Pred_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uadalp : SVE2_2VectorArg_Pred_Long_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Uniform complex integer arithmetic // -def int_aarch64_sve_cadd_x : AdvSIMD_SVE2_CADD_Intrinsic; -def int_aarch64_sve_sqcadd_x : AdvSIMD_SVE2_CADD_Intrinsic; -def int_aarch64_sve_cmla_x : AdvSIMD_SVE2_CMLA_Intrinsic; -def int_aarch64_sve_cmla_lane_x : AdvSIMD_SVE_CMLA_LANE_Intrinsic; -def int_aarch64_sve_sqrdcmlah_x : AdvSIMD_SVE2_CMLA_Intrinsic; -def int_aarch64_sve_sqrdcmlah_lane_x : AdvSIMD_SVE_CMLA_LANE_Intrinsic; +def int_aarch64_sve_cadd_x : AdvSIMD_SVE2_CADD_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqcadd_x : AdvSIMD_SVE2_CADD_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmla_x : AdvSIMD_SVE2_CMLA_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmla_lane_x : AdvSIMD_SVE_CMLA_LANE_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdcmlah_x : AdvSIMD_SVE2_CMLA_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdcmlah_lane_x : AdvSIMD_SVE_CMLA_LANE_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Widening complex integer arithmetic // -def int_aarch64_sve_saddlbt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssublbt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssubltb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_saddlbt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssublbt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssubltb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Widening complex integer dot product // -def int_aarch64_sve_cdot : AdvSIMD_SVE_DOT_Indexed_Intrinsic; -def int_aarch64_sve_cdot_lane : AdvSIMD_SVE_CDOT_LANE_Intrinsic; +def int_aarch64_sve_cdot : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cdot_lane : 
AdvSIMD_SVE_CDOT_LANE_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Floating-point widening multiply-accumulate @@ -2586,137 +2585,137 @@ def int_aarch64_sve_flogb : AdvSIMD_SVE_LOGB_Intrinsic; // SVE2 - Vector histogram count // -def int_aarch64_sve_histcnt : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_histseg : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_histcnt : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_histseg : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Character match // -def int_aarch64_sve_match : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_nmatch : AdvSIMD_SVE_Compare_Intrinsic; +def int_aarch64_sve_match : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_nmatch : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Unary narrowing operations // -def int_aarch64_sve_sqxtnb : SVE2_1VectorArg_Narrowing_Intrinsic; -def int_aarch64_sve_sqxtnt : SVE2_Merged1VectorArg_Narrowing_Intrinsic; -def int_aarch64_sve_sqxtunb : SVE2_1VectorArg_Narrowing_Intrinsic; -def int_aarch64_sve_sqxtunt : SVE2_Merged1VectorArg_Narrowing_Intrinsic; -def int_aarch64_sve_uqxtnb : SVE2_1VectorArg_Narrowing_Intrinsic; -def int_aarch64_sve_uqxtnt : SVE2_Merged1VectorArg_Narrowing_Intrinsic; +def int_aarch64_sve_sqxtnb : SVE2_1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqxtnt : SVE2_Merged1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqxtunb : SVE2_1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqxtunt : SVE2_Merged1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqxtnb : SVE2_1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqxtnt : SVE2_Merged1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Binary narrowing DSP operations // -def int_aarch64_sve_addhnb : SVE2_2VectorArg_Narrowing_Intrinsic; -def int_aarch64_sve_addhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic; +def int_aarch64_sve_addhnb : SVE2_2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_addhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_raddhnb : SVE2_2VectorArg_Narrowing_Intrinsic; -def int_aarch64_sve_raddhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic; +def int_aarch64_sve_raddhnb : SVE2_2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_raddhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_subhnb : SVE2_2VectorArg_Narrowing_Intrinsic; -def int_aarch64_sve_subhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic; +def int_aarch64_sve_subhnb : SVE2_2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_subhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_rsubhnb : SVE2_2VectorArg_Narrowing_Intrinsic; -def int_aarch64_sve_rsubhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic; +def int_aarch64_sve_rsubhnb : SVE2_2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_rsubhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>; // Narrowing shift right -def int_aarch64_sve_shrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic; -def int_aarch64_sve_shrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic; +def int_aarch64_sve_shrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_shrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_rshrnb : 
SVE2_1VectorArg_Imm_Narrowing_Intrinsic; -def int_aarch64_sve_rshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic; +def int_aarch64_sve_rshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_rshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; // Saturating shift right - signed input/output -def int_aarch64_sve_sqshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic; -def int_aarch64_sve_sqshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic; +def int_aarch64_sve_sqshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_sqrshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic; -def int_aarch64_sve_sqrshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic; +def int_aarch64_sve_sqrshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; // Saturating shift right - unsigned input/output -def int_aarch64_sve_uqshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic; -def int_aarch64_sve_uqshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic; +def int_aarch64_sve_uqshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_uqrshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic; -def int_aarch64_sve_uqrshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic; +def int_aarch64_sve_uqrshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqrshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; // Saturating shift right - signed input, unsigned output -def int_aarch64_sve_sqshrunb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic; -def int_aarch64_sve_sqshrunt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic; +def int_aarch64_sve_sqshrunb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqshrunt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_sqrshrunb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic; -def int_aarch64_sve_sqrshrunt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic; +def int_aarch64_sve_sqrshrunb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrshrunt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>; // SVE2 MLA LANE. 
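Aside on the change running through this file: parameterizing these intrinsic classes with [IntrSpeculatable] marks the SVE/SVE2 operations as side-effect free and safe to execute speculatively, which is what allows passes such as LICM to hoist the calls. A minimal sketch of how that property is typically observed on a call site follows; it is illustrative only and not part of the patch (the helper name canSpeculateCall is made up, the LLVM APIs used are pre-existing):

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/InstrTypes.h"

// Returns true when a call such as @llvm.aarch64.sve.and.u may be executed
// speculatively: IntrSpeculatable materializes as the `speculatable`
// attribute on the intrinsic declaration, which is one of the prerequisites
// checked here.
static bool canSpeculateCall(const llvm::CallBase &CB) {
  if (!CB.hasFnAttr(llvm::Attribute::Speculatable))
    return false;
  // Still subject to the generic speculation checks (e.g. no unwinding).
  return llvm::isSafeToSpeculativelyExecute(&CB);
}

Note that `speculatable` only licenses executing the call when the original program might not have; it does not by itself allow the call to be deleted.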
-def int_aarch64_sve_smlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; -def int_aarch64_sve_smullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; -def int_aarch64_sve_umullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; -def int_aarch64_sve_umullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; -def int_aarch64_sve_sqdmlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; -def int_aarch64_sve_sqdmullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_smlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; // SVE2 MLA Unpredicated. 
-def int_aarch64_sve_smlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlslt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlslt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic; - -def int_aarch64_sve_sqdmlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlslt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlalbt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlslbt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_smlalb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlalt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlalb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlalt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlslb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlslt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlslb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlslt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_sqdmlalb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlalt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlslb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlslt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlalbt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlslbt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; // SVE2 ADDSUB Long Unpredicated. 
-def int_aarch64_sve_adclb : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_adclt : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_sbclb : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_sbclt : AdvSIMD_3VectorArg_Intrinsic; +def int_aarch64_sve_adclb : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_adclt : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sbclb : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sbclt : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Polynomial arithmetic // -def int_aarch64_sve_eorbt : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_eortb : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_pmullb_pair : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_pmullt_pair : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_eorbt : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_eortb : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_pmullb_pair : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_pmullt_pair : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; // // SVE2 bitwise ternary operations. // -def int_aarch64_sve_eor3 : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_bcax : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_bsl : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_bsl1n : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_bsl2n : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_nbsl : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_xar : AdvSIMD_2VectorArgIndexed_Intrinsic; +def int_aarch64_sve_eor3 : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bcax : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bsl : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bsl1n : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bsl2n : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_nbsl : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_xar : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Optional AES, SHA-3 and SM4 @@ -2725,70 +2724,70 @@ def int_aarch64_sve_xar : AdvSIMD_2VectorArgIndexed_Intrinsic; def int_aarch64_sve_aesd : ClangBuiltin<"__builtin_sve_svaesd_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty, llvm_nxv16i8_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_aesimc : ClangBuiltin<"__builtin_sve_svaesimc_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_aese : ClangBuiltin<"__builtin_sve_svaese_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty, llvm_nxv16i8_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_aesmc : ClangBuiltin<"__builtin_sve_svaesmc_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_rax1 : ClangBuiltin<"__builtin_sve_svrax1_u64">, DefaultAttrsIntrinsic<[llvm_nxv2i64_ty], [llvm_nxv2i64_ty, llvm_nxv2i64_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_sm4e : ClangBuiltin<"__builtin_sve_svsm4e_u32">, DefaultAttrsIntrinsic<[llvm_nxv4i32_ty], [llvm_nxv4i32_ty, llvm_nxv4i32_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_sm4ekey : ClangBuiltin<"__builtin_sve_svsm4ekey_u32">, 
DefaultAttrsIntrinsic<[llvm_nxv4i32_ty], [llvm_nxv4i32_ty, llvm_nxv4i32_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; // // SVE2 - Extended table lookup/permute // -def int_aarch64_sve_tbl2 : AdvSIMD_SVE2_TBX_Intrinsic; -def int_aarch64_sve_tbx : AdvSIMD_SVE2_TBX_Intrinsic; +def int_aarch64_sve_tbl2 : AdvSIMD_SVE2_TBX_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_tbx : AdvSIMD_SVE2_TBX_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Lookup Table // -def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic; -def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic; +def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>; def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + [IntrNoMem, ImmArg>, IntrSpeculatable]>; // // SVE2 - Optional bit permutation // -def int_aarch64_sve_bdep_x : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_bext_x : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_bgrp_x : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_bdep_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bext_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bgrp_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; // // SVE ACLE: 7.3. INT8 matrix multiply extensions // -def int_aarch64_sve_ummla : SVE_MatMul_Intrinsic; -def int_aarch64_sve_smmla : SVE_MatMul_Intrinsic; -def int_aarch64_sve_usmmla : SVE_MatMul_Intrinsic; +def int_aarch64_sve_ummla : SVE_MatMul_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smmla : SVE_MatMul_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_usmmla : SVE_MatMul_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_usdot : AdvSIMD_SVE_DOT_Intrinsic; -def int_aarch64_sve_usdot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic; -def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic; +def int_aarch64_sve_usdot : AdvSIMD_SVE_DOT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_usdot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculatable]>; // // SVE ACLE: 7.4/5. 
FP64/FP32 matrix multiply extensions @@ -2885,14 +2884,14 @@ def int_aarch64_sve_stnt1_pn_x4 : SVE2p1_Store_PN_X4_Intrinsic; // SVE2 - Contiguous conflict detection // -def int_aarch64_sve_whilerw_b : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilerw_h : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilerw_s : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilerw_d : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilewr_b : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilewr_h : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilewr_s : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic; +def int_aarch64_sve_whilerw_b : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilerw_h : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilerw_s : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilerw_d : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilewr_b : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilewr_h : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilewr_s : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; // Scalable Matrix Extension (SME) Intrinsics let TargetPrefix = "aarch64" in { @@ -3127,8 +3126,8 @@ let TargetPrefix = "aarch64" in { // Clamp // - def int_aarch64_sve_sclamp : AdvSIMD_3VectorArg_Intrinsic; - def int_aarch64_sve_uclamp : AdvSIMD_3VectorArg_Intrinsic; + def int_aarch64_sve_sclamp : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; + def int_aarch64_sve_uclamp : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; def int_aarch64_sve_fclamp : AdvSIMD_3VectorArg_Intrinsic; @@ -3136,7 +3135,7 @@ let TargetPrefix = "aarch64" in { // Reversal // - def int_aarch64_sve_revd : AdvSIMD_Merged1VectorArg_Intrinsic; + def int_aarch64_sve_revd : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; // // Predicate selection @@ -3837,11 +3836,11 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_uzpq_x4 : SVE2_VG4_ZipUzp_Intrinsic; // Vector dot-products (2-way) - def int_aarch64_sve_sdot_x2 : SVE2_3VectorArg_Long_Intrinsic; - def int_aarch64_sve_udot_x2 : SVE2_3VectorArg_Long_Intrinsic; + def int_aarch64_sve_sdot_x2 : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; + def int_aarch64_sve_udot_x2 : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; def int_aarch64_sve_fdot_x2 : SVE2_3VectorArg_Long_Intrinsic; - def int_aarch64_sve_sdot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic; - def int_aarch64_sve_udot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic; + def int_aarch64_sve_sdot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; + def int_aarch64_sve_udot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; def int_aarch64_sve_fdot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic; // @@ -3932,30 +3931,30 @@ let TargetPrefix = "aarch64" in { // SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 // -def int_aarch64_sve_zipq1 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_zipq2 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uzpq1 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uzpq2 : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_zipq1 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zipq2 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzpq1 : 
AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzpq2 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; // SVE2.1 - Programmable table lookup within each quadword vector segment // (zeroing)/(merging) // -def int_aarch64_sve_tblq : AdvSIMD_SVE_TBL_Intrinsic; -def int_aarch64_sve_tbxq : AdvSIMD_SVE2_TBX_Intrinsic; +def int_aarch64_sve_tblq : AdvSIMD_SVE_TBL_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_tbxq : AdvSIMD_SVE2_TBX_Intrinsic<[IntrSpeculatable]>; // SVE2.1 - Extract vector segment from each pair of quadword segments. // -def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic; +def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; // // SVE2.1 - Move predicate to/from vector // -def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic; +def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic; +def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic; -def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic; +def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic<[IntrSpeculatable]>; def int_aarch64_sme_mopa_nonwide : SME_OuterProduct_Intrinsic; def int_aarch64_sme_mops_nonwide : SME_OuterProduct_Intrinsic; diff --git a/llvm/include/llvm/MC/MCParser/AsmLexer.h b/llvm/include/llvm/MC/MCParser/AsmLexer.h index c514b768637d1..7848fc706d5eb 100644 --- a/llvm/include/llvm/MC/MCParser/AsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/AsmLexer.h @@ -44,6 +44,7 @@ class AsmLexer { SmallVector CurTok; const char *CurPtr = nullptr; + /// NULL-terminated buffer. NULL terminator must reside at `CurBuf.end()`. StringRef CurBuf; /// The location and description of the current error @@ -190,6 +191,12 @@ class AsmLexer { /// literals. void setLexHLASMStrings(bool V) { LexHLASMStrings = V; } + /// Set buffer to be lexed. + /// `Buf` must be NULL-terminated. NULL terminator must reside at `Buf.end()`. + /// `ptr` if provided must be in range [`Buf.begin()`, `buf.end()`] or NULL. + /// Specifies where lexing of buffer should begin. + /// `EndStatementAtEOF` specifies whether `AsmToken::EndOfStatement` should be + /// returned upon reaching end of buffer. LLVM_ABI void setBuffer(StringRef Buf, const char *ptr = nullptr, bool EndStatementAtEOF = true); diff --git a/llvm/include/llvm/Support/ModRef.h b/llvm/include/llvm/Support/ModRef.h index 71f3b5bcb9c2b..34f116e478966 100644 --- a/llvm/include/llvm/Support/ModRef.h +++ b/llvm/include/llvm/Support/ModRef.h @@ -66,10 +66,15 @@ enum class IRMemLocation { ErrnoMem = 2, /// Any other memory. Other = 3, + /// Represents target specific state. + TargetMem0 = 4, + TargetMem1 = 5, /// Helpers to iterate all locations in the MemoryEffectsBase class. First = ArgMem, - Last = Other, + FirstTarget = TargetMem0, + // TargetMem IDs must be at the end of the list. 
+ Last = TargetMem1, }; template class MemoryEffectsBase { diff --git a/llvm/lib/Analysis/ScopedNoAliasAA.cpp b/llvm/lib/Analysis/ScopedNoAliasAA.cpp index 4d6c0cc71f898..d24ad0255256c 100644 --- a/llvm/lib/Analysis/ScopedNoAliasAA.cpp +++ b/llvm/lib/Analysis/ScopedNoAliasAA.cpp @@ -116,7 +116,7 @@ static void collectMDInDomain(const MDNode *List, const MDNode *Domain, /// Collect the set of scoped domains relevant to the noalias scopes. void ScopedNoAliasAAResult::collectScopedDomains( - const MDNode *NoAlias, SmallPtrSetImpl &Domains) const { + const MDNode *NoAlias, SmallPtrSetImpl &Domains) { if (!NoAlias) return; assert(Domains.empty() && "Domains should be empty"); @@ -127,7 +127,7 @@ void ScopedNoAliasAAResult::collectScopedDomains( } bool ScopedNoAliasAAResult::mayAliasInScopes(const MDNode *Scopes, - const MDNode *NoAlias) const { + const MDNode *NoAlias) { if (!Scopes || !NoAlias) return true; diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index a4d82a4c6f3b6..74906e9a0d7eb 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -708,6 +708,8 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(write); KEYWORD(readwrite); KEYWORD(argmem); + KEYWORD(target_mem0); + KEYWORD(target_mem1); KEYWORD(inaccessiblemem); KEYWORD(errnomem); KEYWORD(argmemonly); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index cd031be6c7022..43b1d8faf7201 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -2552,6 +2552,10 @@ static std::optional keywordToLoc(lltok::Kind Tok) { return IRMemLocation::InaccessibleMem; case lltok::kw_errnomem: return IRMemLocation::ErrnoMem; + case lltok::kw_target_mem0: + return IRMemLocation::TargetMem0; + case lltok::kw_target_mem1: + return IRMemLocation::TargetMem1; default: return std::nullopt; } diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index cacb292acee18..ba28e4dda3313 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3439,6 +3439,18 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_LROUND: + case TargetOpcode::G_LLROUND: + Observer.changingInstr(MI); + + if (TypeIdx == 0) + widenScalarDst(MI, WideTy); + else + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); + + Observer.changedInstr(MI); + return Legalized; + case TargetOpcode::G_INTTOPTR: if (TypeIdx != 1) return UnableToLegalize; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8e94b492487cf..e29f74934f24c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13018,22 +13018,34 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { return SDValue(); } -// partial_reduce_*mla(acc, mul(ext(a), ext(b)), splat(1)) +// partial_reduce_*mla(acc, mul(*ext(a), *ext(b)), splat(1)) // -> partial_reduce_*mla(acc, a, b) // -// partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) -// -> partial_reduce_*mla(acc, x, C) +// partial_reduce_*mla(acc, mul(*ext(x), splat(C)), splat(1)) +// -> partial_reduce_*mla(acc, x, splat(C)) // -// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0)) -// -> partial_reduce_fmla(acc, a, b) +// partial_reduce_*mla(acc, sel(p, mul(*ext(a), *ext(b)), splat(0)), splat(1)) +// -> 
partial_reduce_*mla(acc, sel(p, a, splat(0)), b)
+//
+// partial_reduce_*mla(acc, sel(p, mul(*ext(a), splat(C)), splat(0)), splat(1))
+// -> partial_reduce_*mla(acc, sel(p, a, splat(0)), splat(C))
 SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
   SDLoc DL(N);
   auto *Context = DAG.getContext();
   SDValue Acc = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   SDValue Op2 = N->getOperand(2);
-  unsigned Opc = Op1->getOpcode();
+  unsigned Opc = Op1->getOpcode();
+
+  // Handle predication by moving the SELECT into the operand of the MUL.
+  SDValue Pred;
+  if (Opc == ISD::VSELECT && (isZeroOrZeroSplat(Op1->getOperand(2)) ||
+                              isZeroOrZeroSplatFP(Op1->getOperand(2)))) {
+    Pred = Op1->getOperand(0);
+    Op1 = Op1->getOperand(1);
+    Opc = Op1->getOpcode();
+  }
+
   if (Opc != ISD::MUL && Opc != ISD::FMUL && Opc != ISD::SHL)
     return SDValue();
@@ -13068,6 +13080,19 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
   SDValue LHSExtOp = LHS->getOperand(0);
   EVT LHSExtOpVT = LHSExtOp.getValueType();
+  // When Pred is non-zero, set Op = select(Pred, Op, splat(0)) and freeze
+  // OtherOp to keep the same semantics when moving the selects into the MUL
+  // operands.
+  auto ApplyPredicate = [&](SDValue &Op, SDValue &OtherOp) {
+    if (Pred) {
+      EVT OpVT = Op.getValueType();
+      SDValue Zero = OpVT.isFloatingPoint() ? DAG.getConstantFP(0.0, DL, OpVT)
+                                            : DAG.getConstant(0, DL, OpVT);
+      Op = DAG.getSelect(DL, OpVT, Pred, Op, Zero);
+      OtherOp = DAG.getFreeze(OtherOp);
+    }
+  };
+
   // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1))
   // -> partial_reduce_*mla(acc, x, C)
   APInt C;
@@ -13090,8 +13115,9 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
           TLI.getTypeToTransformTo(*Context, LHSExtOpVT)))
     return SDValue();
-  return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp,
-                     DAG.getConstant(CTrunc, DL, LHSExtOpVT));
+  SDValue C = DAG.getConstant(CTrunc, DL, LHSExtOpVT);
+  ApplyPredicate(C, LHSExtOp);
+  return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp, C);
 }
 
 unsigned RHSOpcode = RHS->getOpcode();
@@ -13132,17 +13158,17 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
           TLI.getTypeToTransformTo(*Context, LHSExtOpVT)))
     return SDValue();
+  ApplyPredicate(RHSExtOp, LHSExtOp);
   return DAG.getNode(NewOpc, DL, N->getValueType(0), Acc, LHSExtOp, RHSExtOp);
 }
-// partial.reduce.umla(acc, zext(op), splat(1))
-// -> partial.reduce.umla(acc, op, splat(trunc(1)))
-// partial.reduce.smla(acc, sext(op), splat(1))
-// -> partial.reduce.smla(acc, op, splat(trunc(1)))
+// partial.reduce.*mla(acc, *ext(op), splat(1))
+// -> partial.reduce.*mla(acc, op, splat(trunc(1)))
 // partial.reduce.sumla(acc, sext(op), splat(1))
 // -> partial.reduce.smla(acc, op, splat(trunc(1)))
-// partial.reduce.fmla(acc, fpext(op), splat(1.0))
-// -> partial.reduce.fmla(acc, op, splat(1.0))
+//
+// partial.reduce.*mla(acc, sel(p, *ext(op), splat(0)), splat(1))
+// -> partial.reduce.*mla(acc, sel(p, op, splat(0)), splat(trunc(1)))
 SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
   SDLoc DL(N);
   SDValue Acc = N->getOperand(0);
@@ -13152,7 +13178,15 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
   if (!llvm::isOneOrOneSplat(Op2) && !llvm::isOneOrOneSplatFP(Op2))
     return SDValue();
+  SDValue Pred;
   unsigned Op1Opcode = Op1.getOpcode();
+  if (Op1Opcode == ISD::VSELECT && (isZeroOrZeroSplat(Op1->getOperand(2)) ||
+                                    isZeroOrZeroSplatFP(Op1->getOperand(2)))) {
+    Pred = Op1->getOperand(0);
+    Op1 = Op1->getOperand(1);
+    Op1Opcode = Op1->getOpcode();
+  }
+
   if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND)
return SDValue(); @@ -13181,6 +13215,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { ? DAG.getConstantFP(1, DL, UnextOp1VT) : DAG.getConstant(1, DL, UnextOp1VT); + if (Pred) { + SDValue Zero = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? DAG.getConstantFP(0, DL, UnextOp1VT) + : DAG.getConstant(0, DL, UnextOp1VT); + Constant = DAG.getSelect(DL, UnextOp1VT, Pred, Constant, Zero); + } return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1, Constant); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index cad66c2b0d381..9801581f506e6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -13000,6 +13000,11 @@ bool llvm::isZeroOrZeroSplat(SDValue N, bool AllowUndefs) { return C && C->isZero(); } +bool llvm::isZeroOrZeroSplatFP(SDValue N, bool AllowUndefs) { + ConstantFPSDNode *C = isConstOrConstSplatFP(N, AllowUndefs); + return C && C->isZero(); +} + HandleSDNode::~HandleSDNode() { DropOperands(); } diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt index db16a3005f6c1..41402f7a69ccb 100644 --- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt @@ -14,7 +14,6 @@ add_llvm_component_library(LLVMOrcJIT CompileOnDemandLayer.cpp CompileUtils.cpp Core.cpp - DebugObjectManagerPlugin.cpp DebugUtils.cpp EHFrameRegistrationPlugin.cpp EPCDynamicLibrarySearchGenerator.cpp diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt index 186df5dad072e..ab287c7af60be 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_component_library(LLVMOrcDebugging DebugInfoSupport.cpp DebuggerSupport.cpp DebuggerSupportPlugin.cpp + ELFDebugObjectPlugin.cpp LLJITUtilsCBindings.cpp PerfSupportPlugin.cpp VTuneSupportPlugin.cpp diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupport.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupport.cpp index 06667869b4803..7be58871ff57b 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupport.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupport.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupport.h" -#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h" +#include "llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" #define DEBUG_TYPE "orc" @@ -36,8 +36,8 @@ Error enableDebuggerSupport(LLJIT &J) { switch (TT.getObjectFormat()) { case Triple::ELF: { Error TargetSymErr = Error::success(); - ObjLinkingLayer->addPlugin(std::make_unique( - ES, false, true, TargetSymErr)); + ObjLinkingLayer->addPlugin( + std::make_unique(ES, false, true, TargetSymErr)); return TargetSymErr; } case Triple::MachO: { diff --git a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp similarity index 94% rename from llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp rename to llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp index d183134f3b769..9f556b0d07a8b 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp +++ 
b/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp @@ -1,4 +1,4 @@ -//===------- DebugObjectManagerPlugin.cpp - JITLink debug objects ---------===// +//===------- ELFDebugObjectPlugin.cpp - JITLink debug objects ---------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" +#include "llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringMap.h" @@ -406,10 +406,9 @@ createDebugObjectFromBuffer(ExecutionSession &ES, LinkGraph &G, } } -DebugObjectManagerPlugin::DebugObjectManagerPlugin(ExecutionSession &ES, - bool RequireDebugSections, - bool AutoRegisterCode, - Error &Err) +ELFDebugObjectPlugin::ELFDebugObjectPlugin(ExecutionSession &ES, + bool RequireDebugSections, + bool AutoRegisterCode, Error &Err) : ES(ES), RequireDebugSections(RequireDebugSections), AutoRegisterCode(AutoRegisterCode) { // Pass bootstrap symbol for registration function to enable debugging @@ -418,9 +417,9 @@ DebugObjectManagerPlugin::DebugObjectManagerPlugin(ExecutionSession &ES, {{RegistrationAction, rt::RegisterJITLoaderGDBAllocActionName}}); } -DebugObjectManagerPlugin::~DebugObjectManagerPlugin() = default; +ELFDebugObjectPlugin::~ELFDebugObjectPlugin() = default; -void DebugObjectManagerPlugin::notifyMaterializing( +void ELFDebugObjectPlugin::notifyMaterializing( MaterializationResponsibility &MR, LinkGraph &G, JITLinkContext &Ctx, MemoryBufferRef ObjBuffer) { std::lock_guard Lock(PendingObjsLock); @@ -443,9 +442,9 @@ void DebugObjectManagerPlugin::notifyMaterializing( } } -void DebugObjectManagerPlugin::modifyPassConfig( - MaterializationResponsibility &MR, LinkGraph &G, - PassConfiguration &PassConfig) { +void ELFDebugObjectPlugin::modifyPassConfig(MaterializationResponsibility &MR, + LinkGraph &G, + PassConfiguration &PassConfig) { // Not all link artifacts have associated debug objects. std::lock_guard Lock(PendingObjsLock); auto It = PendingObjs.find(&MR); @@ -507,16 +506,15 @@ void DebugObjectManagerPlugin::modifyPassConfig( } } -Error DebugObjectManagerPlugin::notifyFailed( - MaterializationResponsibility &MR) { +Error ELFDebugObjectPlugin::notifyFailed(MaterializationResponsibility &MR) { std::lock_guard Lock(PendingObjsLock); PendingObjs.erase(&MR); return Error::success(); } -void DebugObjectManagerPlugin::notifyTransferringResources(JITDylib &JD, - ResourceKey DstKey, - ResourceKey SrcKey) { +void ELFDebugObjectPlugin::notifyTransferringResources(JITDylib &JD, + ResourceKey DstKey, + ResourceKey SrcKey) { // Debug objects are stored by ResourceKey only after registration. // Thus, pending objects don't need to be updated here. std::lock_guard Lock(RegisteredObjsLock); @@ -530,8 +528,8 @@ void DebugObjectManagerPlugin::notifyTransferringResources(JITDylib &JD, } } -Error DebugObjectManagerPlugin::notifyRemovingResources(JITDylib &JD, - ResourceKey Key) { +Error ELFDebugObjectPlugin::notifyRemovingResources(JITDylib &JD, + ResourceKey Key) { // Removing the resource for a pending object fails materialization, so they // get cleaned up in the notifyFailed() handler. 
std::lock_guard Lock(RegisteredObjsLock); diff --git a/llvm/lib/ExecutionEngine/Orc/MachO.cpp b/llvm/lib/ExecutionEngine/Orc/MachO.cpp index 731d24d1272d4..6b0f96da19dc6 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachO.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachO.cpp @@ -282,7 +282,7 @@ Expected ForceLoadMachOArchiveMembers::operator()( return true; } -LLVM_ABI SmallVector> +SmallVector> noFallbackArchs(uint32_t CPUType, uint32_t CPUSubType) { SmallVector> Result; Result.push_back({CPUType, CPUSubType}); diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 65815cbd70f32..690e4e29a3929 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -660,6 +660,12 @@ std::string Attribute::getAsString(bool InAttrGrp) const { break; case IRMemLocation::Other: llvm_unreachable("This is represented as the default access kind"); + case IRMemLocation::TargetMem0: + OS << "target_mem0: "; + break; + case IRMemLocation::TargetMem1: + OS << "target_mem1: "; + break; } OS << getModRefStr(MR); } diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index 1af4a297babaa..8e4b7be98bdb6 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -119,6 +119,11 @@ AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { void AsmLexer::setBuffer(StringRef Buf, const char *ptr, bool EndStatementAtEOF) { + // Buffer must be NULL-terminated. NULL terminator must reside at `Buf.end()`. + // It must be safe to dereference `Buf.end()`. + assert(*Buf.end() == '\0' && + "Buffer provided to AsmLexer lacks null terminator."); + CurBuf = Buf; if (ptr) diff --git a/llvm/lib/Support/ModRef.cpp b/llvm/lib/Support/ModRef.cpp index 2bb9bc945bd2e..1083c72902c0b 100644 --- a/llvm/lib/Support/ModRef.cpp +++ b/llvm/lib/Support/ModRef.cpp @@ -49,6 +49,12 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) { case IRMemLocation::Other: OS << "Other: "; break; + case IRMemLocation::TargetMem0: + OS << "TargetMem0: "; + break; + case IRMemLocation::TargetMem1: + OS << "TargetMem1: "; + break; } OS << ME.getModRef(Loc); }); diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index f63981b87c1c1..34d74d04c4419 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1063,6 +1063,7 @@ AArch64ExpandPseudo::expandCommitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); + [[maybe_unused]] auto *RI = MBB.getParent()->getSubtarget().getRegisterInfo(); // Compare TPIDR2_EL0 against 0. Commit ZA if TPIDR2_EL0 is non-zero. MachineInstrBuilder Branch = @@ -1073,21 +1074,25 @@ AArch64ExpandPseudo::expandCommitZASave(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::BL)); // Copy operands (mainly the regmask) from the pseudo. - for (unsigned I = 2; I < MI.getNumOperands(); ++I) + for (unsigned I = 3; I < MI.getNumOperands(); ++I) MIB.add(MI.getOperand(I)); // Clear TPIDR2_EL0. 
BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::MSR)) .addImm(AArch64SysReg::TPIDR2_EL0) .addReg(AArch64::XZR); bool ZeroZA = MI.getOperand(1).getImm() != 0; + bool ZeroZT0 = MI.getOperand(2).getImm() != 0; if (ZeroZA) { - [[maybe_unused]] auto *TRI = - MBB.getParent()->getSubtarget().getRegisterInfo(); - assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!"); + assert(MI.definesRegister(AArch64::ZAB0, RI) && "should define ZA!"); BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::ZERO_M)) .addImm(ZERO_ALL_ZA_MASK) .addDef(AArch64::ZAB0, RegState::ImplicitDefine); } + if (ZeroZT0) { + assert(MI.definesRegister(AArch64::ZT0, RI) && "should define ZT0!"); + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::ZERO_T)) + .addDef(AArch64::ZT0); + } MI.eraseFromParent(); return &EndBB; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 5bb70ee11b06d..737169253ddb3 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -108,7 +108,8 @@ def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>; def CommitZASavePseudo : Pseudo<(outs), - (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>, + (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i1imm:$zero_zt0, + i64imm:$commit_routine, variable_ops), []>, Sched<[]>; def AArch64_inout_za_use diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 1dd132e9a7301..cb098751fd74d 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -2602,14 +2602,14 @@ foreach n=0-15 in { //===----------------------------------------------------------------------===// // GIC -class GIC op1, bits<4> crn, bits<4> crm, bits<3> op2> { +class GIC op1, bits<4> crn, bits<4> crm, bits<3> op2, bit needsreg = 1> { string Name = name; bits<14> Encoding; let Encoding{13-11} = op1; let Encoding{10-7} = crn; let Encoding{6-3} = crm; let Encoding{2-0} = op2; - bit NeedsReg = 1; + bit NeedsReg = needsreg; string RequiresStr = [{ {AArch64::FeatureGCIE} }]; } @@ -2686,12 +2686,12 @@ def : GSB<"ack", 0b000, 0b1100, 0b0000, 0b001>; def : GICR<"cdia", 0b000, 0b1100, 0b0011, 0b000>; def : GICR<"cdnmia", 0b000, 0b1100, 0b0011, 0b001>; -// Op1 CRn CRm Op2 +// Op1 CRn CRm Op2, needsreg def : GIC<"cdaff", 0b000, 0b1100, 0b0001, 0b011>; def : GIC<"cddi", 0b000, 0b1100, 0b0010, 0b000>; def : GIC<"cddis", 0b000, 0b1100, 0b0001, 0b000>; def : GIC<"cden", 0b000, 0b1100, 0b0001, 0b001>; -def : GIC<"cdeoi", 0b000, 0b1100, 0b0001, 0b111>; +def : GIC<"cdeoi", 0b000, 0b1100, 0b0001, 0b111, 0>; def : GIC<"cdhm", 0b000, 0b1100, 0b0010, 0b001>; def : GIC<"cdpend", 0b000, 0b1100, 0b0001, 0b100>; def : GIC<"cdpri", 0b000, 0b1100, 0b0001, 0b010>; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 5cc39319d71c0..433cb0387c470 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -4111,7 +4111,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, setRequiredFeatureString(GIC->getRequiredFeatures(), Str); return TokError(Str); } - ExpectRegister = true; + ExpectRegister = GIC->NeedsReg; createSysAlias(GIC->Encoding, Operands, S); } else if (Mnemonic == "gsb") { const AArch64GSB::GSB *GSB = AArch64GSB::lookupGSBByName(Op); diff --git 
a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index a88817c9d2d19..fdf69b04bf676 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -449,10 +449,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .minScalar(0, s32) .libcallFor({{s32, s32}, {s64, s32}, {s128, s32}}); - // TODO: Libcall support for s128. - // TODO: s16 should be legal with full FP16 support. getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) - .legalFor({{s64, s32}, {s64, s64}}); + .legalFor({{s64, s32}, {s64, s64}}) + .legalFor(HasFP16, {{s64, s16}}) + .minScalar(0, s64) + .minScalar(1, s32) + .libcallFor({{s64, s128}}); // TODO: Custom legalization for mismatched types. getActionDefinitionsBuilder(G_FCOPYSIGN) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index bbc34ad35296c..3e4c1101fb8e1 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1034,7 +1034,7 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, if (!GIC || !GIC->haveFeatures(STI.getFeatureBits())) return false; - NeedsReg = true; + NeedsReg = GIC->NeedsReg; Ins = "gic\t"; Name = std::string(GIC->Name); } else { diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index 24d30c731b945..2afbec92392f0 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -842,6 +842,7 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo)) .addReg(TPIDR2EL0) .addImm(ZeroZA ? 1 : 0) + .addImm(/*ZeroZT0=*/false) .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE)) .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); if (ZeroZA) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 49aba39872138..bf04c7fa132c0 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -404,7 +404,7 @@ class SICacheControl { /// Generates code sequences for the memory model of all GFX targets below /// GFX10. -class SIGfx6CacheControl : public SICacheControl { +class SIGfx6CacheControl final : public SICacheControl { public: SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} @@ -443,14 +443,27 @@ class SIGfx6CacheControl : public SICacheControl { Position Pos) const override; }; -class SIGfx10CacheControl : public SIGfx6CacheControl { +/// Generates code sequences for the memory model of GFX10/11. 
+class SIGfx10CacheControl final : public SICacheControl { public: - SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} + SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override; + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return false; + } + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return false; + } + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal, @@ -463,23 +476,17 @@ class SIGfx10CacheControl : public SIGfx6CacheControl { bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; -}; - -class SIGfx11CacheControl : public SIGfx10CacheControl { -public: - SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; + bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, + Position Pos) const override { + return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); + } }; -class SIGfx12CacheControl : public SIGfx11CacheControl { +class SIGfx12CacheControl final : public SICacheControl { protected: // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. // \returns Returns true if \p MI is modified, false otherwise. @@ -504,7 +511,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; public: - SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { + SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) { // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases // the behavior is the same if assuming GFX12.0 in CU mode. assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled()); @@ -915,10 +922,8 @@ std::unique_ptr SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); if (Generation < AMDGPUSubtarget::GFX10) return std::make_unique(ST); - if (Generation < AMDGPUSubtarget::GFX11) - return std::make_unique(ST); if (Generation < AMDGPUSubtarget::GFX12) - return std::make_unique(ST); + return std::make_unique(ST); return std::make_unique(ST); } @@ -1438,8 +1443,7 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, } bool SIGfx10CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, + const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; @@ -1450,7 +1454,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( case SIAtomicScope::AGENT: // Set the L0 and L1 cache policies to MISS_EVICT. 
// Note: there is no L2 cache coherent bypass control at the ISA level. - Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC); + // For GFX10, set GLC+DLC, for GFX11, only set GLC. + Changed |= + enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0)); break; case SIAtomicScope::WORKGROUP: // In WGP mode the waves of a work-group can be executing on either CU of @@ -1504,6 +1510,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC); } + // GFX11: Set MALL NOALLOC for both load and store instructions. + if (AMDGPU::isGFX11(ST)) + Changed |= enableCPolBits(MI, CPol::DLC); + // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -1524,6 +1534,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( Changed |= enableCPolBits(MI, CPol::GLC); Changed |= enableCPolBits(MI, CPol::SLC); + // GFX11: Set MALL NOALLOC for both load and store instructions. + if (AMDGPU::isGFX11(ST)) + Changed |= enableCPolBits(MI, CPol::DLC); + return Changed; } @@ -1722,102 +1736,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx11CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - // Set the L0 and L1 cache policies to MISS_EVICT. - // Note: there is no L2 cache coherent bypass control at the ISA level. - Changed |= enableCPolBits(MI, CPol::GLC); - break; - case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in - // CU mode all waves of a work-group are on the same CU, and so the L0 - // does not need to be bypassed. - if (!ST.isCuModeEnabled()) - Changed |= enableCPolBits(MI, CPol::GLC); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( - MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { - - // Only handle load and store, not atomic read-modify-write insructions. The - // latter use glc to indicate if the atomic returns a result and so must not - // be used for cache control. - assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); - - // Only update load and store, not LLVM IR atomic read-modify-write - // instructions. The latter are always marked as volatile so cannot sensibly - // handle it as do not want to pessimize all atomics. Also they do not support - // the nontemporal attribute. 
- assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); - - bool Changed = false; - - if (IsVolatile) { - // Set L0 and L1 cache policy to be MISS_EVICT for load instructions - // and MISS_LRU for store instructions. - // Note: there is no L2 cache coherent bypass control at the ISA level. - if (Op == SIMemOp::LOAD) - Changed |= enableCPolBits(MI, CPol::GLC); - - // Set MALL NOALLOC for load and store instructions. - Changed |= enableCPolBits(MI, CPol::DLC); - - // Ensure operation has completed at system scope to cause all volatile - // operations to be visible outside the program in a global order. Do not - // request cross address space as only the global address space can be - // observable outside the program, so no need to cause a waitcnt for LDS - // address space operations. - Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered, - /*AtomicsOnly=*/false); - return Changed; - } - - if (IsNonTemporal) { - // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT - // and L2 cache policy to STREAM. - // For stores setting both GLC and SLC configures L0 and L1 cache policy - // to MISS_EVICT and the L2 cache policy to STREAM. - if (Op == SIMemOp::STORE) - Changed |= enableCPolBits(MI, CPol::GLC); - Changed |= enableCPolBits(MI, CPol::SLC); - - // Set MALL NOALLOC for load and store instructions. - Changed |= enableCPolBits(MI, CPol::DLC); - return Changed; - } - - return Changed; -} - bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, AMDGPU::CPol::CPol Value) const { MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index f7fc9528920a6..75e7cf347e461 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -2507,3 +2507,12 @@ void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF, } } } + +int RISCVFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const { + return 0; +} + +Register +RISCVFrameLowering::getInitialCFARegister(const MachineFunction &MF) const { + return RISCV::X2; +} diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index 6af63a4885f35..87980dfb09f96 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -23,6 +23,9 @@ class RISCVFrameLowering : public TargetFrameLowering { public: explicit RISCVFrameLowering(const RISCVSubtarget &STI); + int getInitialCFAOffset(const MachineFunction &MF) const override; + Register getInitialCFARegister(const MachineFunction &MF) const override; + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index fb298ee35d6c2..921d12757d672 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2535,7 +2535,7 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, // TODO: For sizes which aren't multiples of VLEN sizes, this may not be // a cheap extract. However, this case is important in practice for // shuffled extracts of longer vectors. How resolve? 
- return (ResElts * 2) == SrcElts && (Index == 0 || Index == ResElts); + return (ResElts * 2) == SrcElts && Index == ResElts; } MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 16ef67da83128..911bd7ee2876f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -103,6 +103,11 @@ static cl::opt cl::desc("Enable Machine Pipeliner for RISC-V"), cl::init(false), cl::Hidden); +static cl::opt EnableCFIInstrInserter( + "riscv-enable-cfi-instr-inserter", + cl::desc("Enable CFI Instruction Inserter for RISC-V"), cl::init(false), + cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { RegisterTargetMachine X(getTheRISCV32Target()); RegisterTargetMachine Y(getTheRISCV64Target()); @@ -169,7 +174,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, if (TT.isOSFuchsia() && !TT.isArch64Bit()) report_fatal_error("Fuchsia is only supported for 64-bit"); - setCFIFixup(true); + setCFIFixup(!EnableCFIInstrInserter); } const RISCVSubtarget * @@ -578,6 +583,9 @@ void RISCVPassConfig::addPreEmitPass2() { addPass(createUnpackMachineBundles([&](const MachineFunction &MF) { return MF.getFunction().getParent()->getModuleFlag("kcfi"); })); + + if (EnableCFIInstrInserter) + addPass(createCFIInstrInserter()); } void RISCVPassConfig::addMachineSSAOptimization() { diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 0a8838cbd45c7..c742b92416362 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -62,7 +62,7 @@ struct DemandedVL { }; class RISCVVLOptimizer : public MachineFunctionPass { - const MachineRegisterInfo *MRI; + MachineRegisterInfo *MRI; const MachineDominatorTree *MDT; const TargetInstrInfo *TII; @@ -1392,6 +1392,42 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { return true; } +/// Given a vslidedown.vx like: +/// +/// %slideamt = ADDI %x, -1 +/// %v = PseudoVSLIDEDOWN_VX %passthru, %src, %slideamt, avl=1 +/// +/// %v will only read the first %slideamt + 1 lanes of %src, which = %x. +/// This is a common case when lowering extractelement. +/// +/// Note that if %x is 0, %slideamt will be all ones. In this case %src will be +/// completely slid down and none of its lanes will be read (since %slideamt is +/// greater than the largest VLMAX of 65536) so we can demand any minimum VL. +static std::optional +getMinimumVLForVSLIDEDOWN_VX(const MachineOperand &UserOp, + const MachineRegisterInfo *MRI) { + const MachineInstr &MI = *UserOp.getParent(); + if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VSLIDEDOWN_VX) + return std::nullopt; + // We're looking at what lanes are used from the src operand. + if (UserOp.getOperandNo() != 2) + return std::nullopt; + // For now, the AVL must be 1. + const MachineOperand &AVL = MI.getOperand(4); + if (!AVL.isImm() || AVL.getImm() != 1) + return std::nullopt; + // The slide amount must be %x - 1. 
+ const MachineOperand &SlideAmt = MI.getOperand(3); + if (!SlideAmt.getReg().isVirtual()) + return std::nullopt; + MachineInstr *SlideAmtDef = MRI->getUniqueVRegDef(SlideAmt.getReg()); + if (SlideAmtDef->getOpcode() != RISCV::ADDI || + SlideAmtDef->getOperand(2).getImm() != -AVL.getImm() || + !SlideAmtDef->getOperand(1).getReg().isVirtual()) + return std::nullopt; + return SlideAmtDef->getOperand(1); +} + DemandedVL RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { const MachineInstr &UserMI = *UserOp.getParent(); @@ -1406,6 +1442,9 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { return DemandedVL::vlmax(); } + if (auto VL = getMinimumVLForVSLIDEDOWN_VX(UserOp, MRI)) + return *VL; + if (RISCVII::readsPastVL( TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) { LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n"); @@ -1624,6 +1663,7 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { // All our checks passed. We can reduce VL. VLOp.ChangeToRegister(CommonVL->getReg(), false); + MRI->constrainRegClass(CommonVL->getReg(), &RISCV::GPRNoX0RegClass); return true; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 14097d7b40a9c..0bdddcffd723d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1541,6 +1541,32 @@ def : Pat<(v4i32 (int_wasm_extadd_pairwise_signed (v8i16 V128:$in))), def : Pat<(v8i16 (int_wasm_extadd_pairwise_signed (v16i8 V128:$in))), (extadd_pairwise_s_I16x8 V128:$in)>; +multiclass ExtAddPairwiseShuffle { + foreach sign = ["s", "u"] in { + def : Pat<(to_ty (add + (!cast("extend_low_"#sign) (from_ty (wasm_shuffle (from_ty V128:$vec), (from_ty srcvalue), + (i32 a0), (i32 a1), (i32 a2), (i32 a3), + (i32 a4), (i32 a5), (i32 a6), (i32 a7), + (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), + (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), (i32 srcvalue)))), + (!cast("extend_low_"#sign) (from_ty (wasm_shuffle (from_ty V128:$vec), (from_ty srcvalue), + (i32 b0), (i32 b1), (i32 b2), (i32 b3), + (i32 b4), (i32 b5), (i32 b6), (i32 b7), + (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), + (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), (i32 srcvalue)))))), + (!cast("extadd_pairwise_"#sign#"_"#suffix) V128:$vec)>; + } +} + +defm : ExtAddPairwiseShuffle; +defm : ExtAddPairwiseShuffle; + // f64x2 <-> f32x4 conversions def demote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; def demote_zero : SDNode<"WebAssemblyISD::DEMOTE_ZERO", demote_t>; diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp index c0b9339e9bc34..b07ce2b958fa0 100644 --- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp @@ -280,8 +280,7 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, if (Arg.hasAttribute(Attribute::ByVal) || Arg.hasAttribute(Attribute::InReg) || Arg.hasAttribute(Attribute::SwiftSelf) || - Arg.hasAttribute(Attribute::SwiftError) || - Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1) + Arg.hasAttribute(Attribute::SwiftError) || VRegs[Idx].size() > 1) return false; if (Arg.hasAttribute(Attribute::StructRet)) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ff7149044d199..deb8ee2d88055 100644 --- 
a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -21479,7 +21479,18 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, "new ScheduleData already in scheduling region"); SD->init(SchedulingRegionID, I); + auto CanIgnoreLoad = [](const Instruction *I) { + const auto *LI = dyn_cast(I); + // If there is a simple load marked as invariant, we can ignore it. + // But, in the (unlikely) case of non-simple invariant load, + // we should not ignore it. + return LI && LI->isSimple() && + LI->getMetadata(LLVMContext::MD_invariant_load); + }; + if (I->mayReadOrWriteMemory() && + // Simple InvariantLoad does not depend on other memory accesses. + !CanIgnoreLoad(I) && (!isa(I) || (cast(I)->getIntrinsicID() != Intrinsic::sideeffect && cast(I)->getIntrinsicID() != diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c81834e401726..fc29ab0c84093 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -32,6 +32,7 @@ #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/FMF.h" @@ -981,6 +982,13 @@ class VPIRMetadata { /// Intersect this VPIRMetada object with \p MD, keeping only metadata /// nodes that are common to both. void intersect(const VPIRMetadata &MD); + + /// Get metadata of kind \p Kind. Returns nullptr if not found. + MDNode *getMetadata(unsigned Kind) const { + auto It = + find_if(Metadata, [Kind](const auto &P) { return P.first == Kind; }); + return It != Metadata.end() ? It->second : nullptr; + } }; /// This is a concrete Recipe that models a single VPlan-level instruction. @@ -3868,6 +3876,75 @@ template <> struct CastInfo : CastInfoVPPhiAccessors {}; +/// Casting from (const) VPRecipeBase -> (const) VPIRMetadata is supported for +/// all recipe types implementing VPIRMetadata. Used by isa<> & co. +namespace detail { +template +static inline auto castToVPIRMetadata(RecipeBasePtrTy R) -> DstTy { + switch (R->getVPDefID()) { + case VPDef::VPInstructionSC: + return cast(R); + case VPDef::VPWidenSC: + return cast(R); + case VPDef::VPWidenCastSC: + return cast(R); + case VPDef::VPWidenIntrinsicSC: + return cast(R); + case VPDef::VPWidenCallSC: + return cast(R); + case VPDef::VPWidenSelectSC: + return cast(R); + case VPDef::VPReplicateSC: + return cast(R); + case VPDef::VPInterleaveSC: + case VPDef::VPInterleaveEVLSC: + return cast(R); + case VPDef::VPWidenLoadSC: + case VPDef::VPWidenLoadEVLSC: + case VPDef::VPWidenStoreSC: + case VPDef::VPWidenStoreEVLSC: + return cast(R); + default: + llvm_unreachable("invalid recipe for VPIRMetadata cast"); + } +} +} // namespace detail + +/// Support casting from VPRecipeBase -> VPIRMetadata, by down-casting to the +/// recipe types implementing VPIRMetadata. Used by cast<>, dyn_cast<> & co. +template +struct CastInfoVPIRMetadata : public CastIsPossible { + static inline bool isPossible(SrcTy R) { + // NOTE: Each recipe inheriting from VPIRMetadata must be listed here and + // also handled in castToVPIRMetadata. + return isa(R); + } + + using RetTy = DstTy *; + + /// doCast is used by cast<>. + static inline RetTy doCast(SrcTy R) { + return detail::castToVPIRMetadata(R); + } + + /// doCastIfPossible is used by dyn_cast<>. 
+ static inline RetTy doCastIfPossible(SrcTy R) { + if (!isPossible(R)) + return nullptr; + return doCast(R); + } +}; +template <> +struct CastInfo + : CastInfoVPIRMetadata {}; +template <> +struct CastInfo + : CastInfoVPIRMetadata {}; + /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It /// holds a sequence of zero or more VPRecipe's each representing a sequence of /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 89118b49bed44..26563242de283 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -24,15 +24,20 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ScalarEvolutionPatternMatch.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/Casting.h" #include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" @@ -2401,6 +2406,7 @@ void VPlanTransforms::optimize(VPlan &Plan) { runPass(removeDeadRecipes, Plan); runPass(createAndOptimizeReplicateRegions, Plan); + runPass(hoistInvariantLoads, Plan); runPass(mergeBlocksIntoPredecessors, Plan); runPass(licm, Plan); } @@ -3914,6 +3920,54 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { } } +void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) { + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + + // Collect candidate loads with invariant addresses and noalias scopes + // metadata and memory-writing recipes with noalias metadata. + SmallVector> CandidateLoads; + SmallVector Stores; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(LoopRegion->getEntry()))) { + for (VPRecipeBase &R : *VPBB) { + // Only handle single-scalar replicated loads with invariant addresses. + if (auto *RepR = dyn_cast(&R)) { + if (RepR->isPredicated() || !RepR->isSingleScalar() || + RepR->getOpcode() != Instruction::Load) + continue; + + VPValue *Addr = RepR->getOperand(0); + if (Addr->isDefinedOutsideLoopRegions()) { + MemoryLocation Loc = *vputils::getMemoryLocation(*RepR); + if (!Loc.AATags.Scope) + continue; + CandidateLoads.push_back({RepR, Loc}); + } + } + if (R.mayWriteToMemory()) { + auto Loc = vputils::getMemoryLocation(R); + if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias) + return; + Stores.push_back(*Loc); + } + } + } + + VPBasicBlock *Preheader = Plan.getVectorPreheader(); + for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) { + // Hoist the load to the preheader if it doesn't alias with any stores + // according to the noalias metadata. 
Other loads should have been hoisted + // by other passes + const AAMDNodes &LoadAA = LoadLoc.AATags; + if (all_of(Stores, [&](const MemoryLocation &StoreLoc) { + return !ScopedNoAliasAAResult::mayAliasInScopes( + LoadAA.Scope, StoreLoc.AATags.NoAlias); + })) { + LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi()); + } + } +} + void VPlanTransforms::materializeConstantVectorTripCount( VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index a44a4f69c917b..708ea4185e1cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -309,6 +309,11 @@ struct VPlanTransforms { /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors. static void materializeBroadcasts(VPlan &Plan); + /// Hoist single-scalar loads with invariant addresses out of the vector loop + /// to the preheader, if they are proven not to alias with any stores in the + /// plan using noalias metadata. + static void hoistInvariantLoads(VPlan &Plan); + // Materialize vector trip counts for constants early if it can simply be // computed as (Original TC / VF * UF) * VF * UF. static void diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 3bc2dfd623777..2536d61392ed1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -11,6 +11,7 @@ #include "VPlanDominatorTree.h" #include "VPlanPatternMatch.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" using namespace llvm; @@ -393,3 +394,20 @@ bool VPBlockUtils::isLatch(const VPBlockBase *VPB, return VPB->getNumSuccessors() == 2 && VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT); } + +std::optional +vputils::getMemoryLocation(const VPRecipeBase &R) { + return TypeSwitch>(&R) + .Case( + [](auto *S) { + MemoryLocation Loc; + // Populate noalias metadata from VPIRMetadata. + if (MDNode *NoAliasMD = S->getMetadata(LLVMContext::MD_noalias)) + Loc.AATags.NoAlias = NoAliasMD; + if (MDNode *AliasScopeMD = + S->getMetadata(LLVMContext::MD_alias_scope)) + Loc.AATags.Scope = AliasScopeMD; + return Loc; + }) + .Default([](auto *) { return std::nullopt; }); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 51bafe0846141..38073380eb54c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -13,6 +13,7 @@ #include "llvm/Support/Compiler.h" namespace llvm { +class MemoryLocation; class ScalarEvolution; class SCEV; } // namespace llvm @@ -74,6 +75,11 @@ getRecipesForUncountableExit(VPlan &Plan, SmallVectorImpl &Recipes, SmallVectorImpl &GEPs); +/// Return a MemoryLocation for \p R with noalias metadata populated from +/// \p R, if the recipe is supported and std::nullopt otherwise. The pointer of +/// the location is conservatively set to nullptr. +std::optional getMemoryLocation(const VPRecipeBase &R); + /// Extracts and returns NoWrap and FastMath flags from the induction binop in /// \p ID. 
inline VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID) { diff --git a/llvm/test/Assembler/aarch64-intrinsics-attributes.ll b/llvm/test/Assembler/aarch64-intrinsics-attributes.ll index 33f2758a4b18c..42691bbb01bc8 100644 --- a/llvm/test/Assembler/aarch64-intrinsics-attributes.ll +++ b/llvm/test/Assembler/aarch64-intrinsics-attributes.ll @@ -19,7 +19,7 @@ declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32) ; CHECK: declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_READNONE_WILLRETURN]] declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) -; CHECK: declare @llvm.aarch64.sve.dup.nxv4i32(, , i32) [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_READNONE_WILLRETURN]] +; CHECK: declare @llvm.aarch64.sve.dup.nxv4i32(, , i32) [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_SPECULATABLE_READNONE_WILLRETURN:#[0-9]+]] declare @llvm.aarch64.sve.dup.nxv4i32(, , i32) ; CHECK: declare void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32>, <4 x i32>, ptr captures(none)) [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_WRITEONLY_WILLRETURN:#[0-9]+]] @@ -33,4 +33,5 @@ declare void @llvm.aarch64.neon.st1x2.v8i16.p0(<8 x i16>, <8 x i16>, ptr) ; CHECK: attributes [[NOFREE_NOUNWIND_WILLRETURN]] = { nofree nounwind willreturn } ; CHECK: attributes [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_READNONE_WILLRETURN]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_SPECULATABLE_READNONE_WILLRETURN]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_WRITEONLY_WILLRETURN]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } diff --git a/llvm/test/Assembler/memory-attribute.ll b/llvm/test/Assembler/memory-attribute.ll index effd4ce7c4548..4c86f8df0e6c1 100644 --- a/llvm/test/Assembler/memory-attribute.ll +++ b/llvm/test/Assembler/memory-attribute.ll @@ -78,3 +78,58 @@ declare void @fn_argmem_read_inaccessiblemem_write() ; CHECK: @fn_argmem_read_inaccessiblemem_write_reordered() declare void @fn_argmem_read_inaccessiblemem_write_reordered() memory(inaccessiblemem: write, argmem: read) + +; CHECK: Function Attrs: memory(target_mem0: write) +; CHECK: @fn_write_mem_target0() +declare void @fn_write_mem_target0() + memory(target_mem0: write) + +; CHECK: Function Attrs: memory(target_mem0: read) +; CHECK: @fn_read_mem_target0() +declare void @fn_read_mem_target0() + memory(target_mem0: read) + +; CHECK: Function Attrs: memory(target_mem1: write) +; CHECK: @fn_write_target_mem1() +declare void @fn_write_target_mem1() + memory(target_mem1: write) + +; CHECK: Function Attrs: memory(target_mem1: read) +; CHECK: @fn_read_target_mem1() +declare void @fn_read_target_mem1() + memory(target_mem1: read) + +; CHECK: Function Attrs: memory(target_mem0: read, target_mem1: write) +; CHECK: @fn_read_target_mem0_write_mem_target1() +declare void @fn_read_target_mem0_write_mem_target1() + memory(target_mem0: read, target_mem1: write) + +; CHECK: Function Attrs: memory(inaccessiblemem: write) +; CHECK: @fn_inaccessiblemem_write_new() +declare void @fn_inaccessiblemem_write_new() + memory(inaccessiblemem: write) + +; CHECK: Function Attrs: memory(inaccessiblemem: read, target_mem0: read, target_mem1: read) +; CHECK: @fn_inaccessiblemem_target_mem0_1read() +declare void @fn_inaccessiblemem_target_mem0_1read() + memory(inaccessiblemem: read, target_mem0: read, target_mem1: read) + +; CHECK: Function Attrs: memory(target_mem0: 
read) +; CHECK: @fn_inaccessiblemem_none_target_mem0_read() +declare void @fn_inaccessiblemem_none_target_mem0_read() + memory(inaccessiblemem: none, target_mem0: read) + +; CHECK: Function Attrs: memory(write, inaccessiblemem: read) +; CHECK: @fn_write_inaccessiblemem_read_target_mem0_write +declare void @fn_write_inaccessiblemem_read_target_mem0_write() + memory(write, inaccessiblemem: read, target_mem0: write) + +; CHECK: Function Attrs: memory(write, target_mem0: read) +; CHECK: @fn_write_inaccessiblemem_write_target_mem0_read() +declare void @fn_write_inaccessiblemem_write_target_mem0_read() + memory(write, inaccessiblemem: write, target_mem0: read) + +; CHECK: Function Attrs: memory(write, target_mem0: read) +; CHECK: @fn_write_target_mem0_readwrite() +declare void @fn_write_target_mem0_readwrite() + memory(write, target_mem0: read) diff --git a/llvm/test/Bitcode/memory-attribute-upgrade.ll b/llvm/test/Bitcode/memory-attribute-upgrade.ll index 915b62a88935d..334a344b96f7f 100644 --- a/llvm/test/Bitcode/memory-attribute-upgrade.ll +++ b/llvm/test/Bitcode/memory-attribute-upgrade.ll @@ -1,7 +1,7 @@ ; RUN: llvm-dis < %S/Inputs/memory-attribute-upgrade.bc | FileCheck %s -; CHECK: ; Function Attrs: memory(write, argmem: read) +; CHECK: ; Function Attrs: memory(write, argmem: read, target_mem0: none, target_mem1: none) ; CHECK-NEXT: define void @test_any_write_argmem_read(ptr %p) -; CHECK: ; Function Attrs: memory(read, argmem: readwrite, inaccessiblemem: none) +; CHECK: ; Function Attrs: memory(read, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-NEXT: define void @test_any_read_argmem_readwrite(ptr %p) diff --git a/llvm/test/CodeGen/AArch64/expand-sme-pseudos.mir b/llvm/test/CodeGen/AArch64/expand-sme-pseudos.mir index 6ca9b9b6cb200..9b745d56c4b7f 100644 --- a/llvm/test/CodeGen/AArch64/expand-sme-pseudos.mir +++ b/llvm/test/CodeGen/AArch64/expand-sme-pseudos.mir @@ -62,7 +62,7 @@ body: | ; CHECK-NEXT: RET undef $lr $x8 = MRS 56965, implicit-def $nzcv - CommitZASavePseudo $x8, 0, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0 + CommitZASavePseudo $x8, 0, 0, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0 RET_ReallyLR @@ -94,7 +94,72 @@ body: | ; CHECK-NEXT: RET undef $lr $x8 = MRS 56965, implicit-def $nzcv - CommitZASavePseudo $x8, 1, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $zab0 + CommitZASavePseudo $x8, 1, 0, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $zab0 + + RET_ReallyLR + +... 
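+# The tests below exercise the widened pseudo. As a rough reference (derived from
+# the AArch64SMEInstrInfo.td and AArch64ExpandPseudoInsts.cpp hunks above), the
+# operand layout is now:
+#   CommitZASavePseudo $tpidr2_el0, zero_za, zero_zt0, &routine, regmask, ...
+# so, for example,
+#   CommitZASavePseudo $x8, 1, 1, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $zab0, implicit-def $zt0
+# expands to the conditional save-call block, an MSR that clears TPIDR2_EL0, then
+# ZERO_M 255 when zero_za is set and $zt0 = ZERO_T when zero_zt0 is set.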
+--- +# X8 = TPIDR2_EL0 +name: commit_za_save_zero_zt0 +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: commit_za_save_zero_zt0 + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x8 = MRS 56965, implicit-def $nzcv + ; CHECK-NEXT: CBNZX $x8, %bb.1 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BL &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $lr, implicit $sp, implicit-def $zt0 + ; CHECK-NEXT: MSR 56965, $xzr + ; CHECK-NEXT: $zt0 = ZERO_T + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: RET undef $lr + $x8 = MRS 56965, implicit-def $nzcv + + CommitZASavePseudo $x8, 0, 1, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $zt0 + + RET_ReallyLR + +... +--- +# X8 = TPIDR2_EL0 +name: commit_za_save_zero_everything +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: commit_za_save_zero_everything + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x8 = MRS 56965, implicit-def $nzcv + ; CHECK-NEXT: CBNZX $x8, %bb.1 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BL &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $lr, implicit $sp, implicit-def $zab0, implicit-def $zt0 + ; CHECK-NEXT: MSR 56965, $xzr + ; CHECK-NEXT: ZERO_M 255, implicit-def $zab0 + ; CHECK-NEXT: $zt0 = ZERO_T + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: RET undef $lr + $x8 = MRS 56965, implicit-def $nzcv + + CommitZASavePseudo $x8, 1, 1, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $zab0, implicit-def $zt0 RET_ReallyLR diff --git a/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll b/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll index cb042757a4a42..3a4be1bda7cd6 100644 --- a/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll +++ b/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll @@ -1,12 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16 -; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-NOFP16,CHECK-GI -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for testmhhs -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhws -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhxs +; RUN: llc < %s -mtriple=aarch64 -global-isel | FileCheck %s --check-prefixes=CHECK-NOFP16 +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK-FP16 define i16 @testmhhs(half %x) { ; CHECK-NOFP16-LABEL: testmhhs: diff --git a/llvm/test/CodeGen/AArch64/llround-conv.ll b/llvm/test/CodeGen/AArch64/llround-conv.ll index 4cc089804ce97..bdee73076347a 100644 --- 
a/llvm/test/CodeGen/AArch64/llround-conv.ll +++ b/llvm/test/CodeGen/AArch64/llround-conv.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s -; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for testmswl -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmsll +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel | FileCheck %s define i32 @testmsws(float %x) { ; CHECK-LABEL: testmsws: diff --git a/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll b/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll index a29dea0eb9f9f..0b18f220067ca 100644 --- a/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll +++ b/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll @@ -1,12 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16 -; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-NOFP16,CHECK-GI -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for testmhhs -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhws -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhxs +; RUN: llc < %s -mtriple=aarch64 -global-isel | FileCheck %s --check-prefixes=CHECK-NOFP16 +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK-FP16 define i16 @testmhhs(half %x) { ; CHECK-NOFP16-LABEL: testmhhs: diff --git a/llvm/test/CodeGen/AArch64/lround-conv.ll b/llvm/test/CodeGen/AArch64/lround-conv.ll index 0bf82b538e70c..4b1782457cc10 100644 --- a/llvm/test/CodeGen/AArch64/lround-conv.ll +++ b/llvm/test/CodeGen/AArch64/lround-conv.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s -; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for testmswl -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmsll +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel | FileCheck %s define i32 @testmsws(float %x) { ; CHECK-LABEL: testmsws: diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add-predicated.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add-predicated.ll new file mode 100644 index 0000000000000..24cdd0a852222 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/partial-reduction-add-predicated.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64" + +define <4 x i32> @predicate_dot_fixed_length(<4 x i32> %acc, <16 x i1> %p, <16 x i8> %a, <16 x i8> %b) #0 { +; CHECK-LABEL: predicate_dot_fixed_length: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; 
CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b +; CHECK-NEXT: ret + %ext.1 = sext <16 x i8> %a to <16 x i32> + %ext.2 = sext <16 x i8> %b to <16 x i32> + %mul = mul nsw <16 x i32> %ext.1, %ext.2 + %sel = select <16 x i1> %p, <16 x i32> %mul, <16 x i32> zeroinitializer + %red = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %sel) + ret <4 x i32> %red +} + +define <4 x i32> @predicate_dot_by_C_fixed_length(<4 x i32> %acc, <16 x i1> %p, <16 x i8> %a) #0 { +; CHECK-LABEL: predicate_dot_by_C_fixed_length: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: movi v3.16b, #127 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b +; CHECK-NEXT: ret + %ext.1 = sext <16 x i8> %a to <16 x i32> + %mul = mul nsw <16 x i32> %ext.1, splat(i32 127) + %sel = select <16 x i1> %p, <16 x i32> %mul, <16 x i32> zeroinitializer + %red = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %sel) + ret <4 x i32> %red +} + +define @predicate_dot_scalable( %acc, %p, %a, %b) #0 { +; CHECK-LABEL: predicate_dot_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: sel z2.b, p0, z2.b, z3.b +; CHECK-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-NEXT: ret + %ext.1 = sext %a to + %ext.2 = sext %b to + %mul = mul nsw %ext.1, %ext.2 + %sel = select %p, %mul, zeroinitializer + %red = call @llvm.vector.partial.reduce.add( %acc, %sel) + ret %red +} + +define @predicate_dot_by_C_scalable( %acc, %p, %a) #0 { +; CHECK-LABEL: predicate_dot_by_C_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.b, p0/z, #127 // =0x7f +; CHECK-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-NEXT: ret + %ext.1 = sext %a to + %mul = mul nsw %ext.1, splat(i32 127) + %sel = select %p, %mul, zeroinitializer + %red = call @llvm.vector.partial.reduce.add( %acc, %sel) + ret %red +} + +define <4 x i32> @predicate_ext_mul_fixed_length(<4 x i32> %acc, <16 x i1> %p, <16 x i8> %a) #0 { +; CHECK-LABEL: predicate_ext_mul_fixed_length: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v3.16b, #1 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b +; CHECK-NEXT: ret + %ext = sext <16 x i8> %a to <16 x i32> + %sel = select <16 x i1> %p, <16 x i32> %ext, <16 x i32> zeroinitializer + %red = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %sel) + ret <4 x i32> %red +} + +define @predicate_ext_mul_scalable( %acc, %p, %a) #0 { +; CHECK-LABEL: predicate_ext_mul_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.b, p0/z, #1 // =0x1 +; CHECK-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-NEXT: ret + %ext = sext %a to + %sel = select %p, %ext, zeroinitializer + %red = call @llvm.vector.partial.reduce.add( %acc, %sel) + ret %red +} + +define <4 x float> @predicated_fdot_fixed_length(<4 x float> %acc, <8 x i1> %p, <8 x half> %a, <8 x half> %b) #1 { +; CHECK-LABEL: predicated_fdot_fixed_length: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2 +; CHECK-NEXT: shl v1.8h, v1.8h, #15 +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: fdot z0.s, z2.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %ext.1 = fpext <8 x half> %a to <8 x float> + %ext.2 = fpext <8 x half> %b to <8 x float> + %mul = fmul <8 x float> %ext.1, %ext.2 + %sel = select <8 x i1> %p, <8 x float> %mul, <8 x 
float> zeroinitializer + %red = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %sel) + ret <4 x float> %red +} + +define @predicated_fdot_scalable( %acc, %p, %a, %b) #1 { +; CHECK-LABEL: predicated_fdot_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: sel z2.h, p0, z2.h, z3.h +; CHECK-NEXT: fdot z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %ext.1 = fpext %a to + %ext.2 = fpext %b to + %mul = fmul %ext.1, %ext.2 + %sel = select %p, %mul, zeroinitializer + %red = call @llvm.vector.partial.reduce.fadd( %acc, %sel) + ret %red +} + +define <4 x float> @predicated_fpext_fmul_fixed_length(<4 x float> %acc, <8 x i1> %p, <8 x half> %a) #1 { +; CHECK-LABEL: predicated_fpext_fmul_fixed_length: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: movi v3.8h, #60, lsl #8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2 +; CHECK-NEXT: shl v1.8h, v1.8h, #15 +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: fdot z0.s, z2.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %ext = fpext <8 x half> %a to <8 x float> + %sel = select <8 x i1> %p, <8 x float> %ext, <8 x float> zeroinitializer + %red = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %sel) + ret <4 x float> %red +} + +define @predicated_fpext_fmul_scalable( %acc, %p, %a) #1 { +; CHECK-LABEL: predicated_fpext_fmul_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fmov z2.h, p0/m, #1.00000000 +; CHECK-NEXT: fdot z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %ext = fpext %a to + %sel = select %p, %ext, zeroinitializer + %red = call @llvm.vector.partial.reduce.fadd( %acc, %sel) + ret %red +} + +attributes #0 = { nounwind "target-features"="+sve,+dotprod" } +attributes #1 = { nounwind "target-features"="+sve2p1,+dotprod" } diff --git a/llvm/test/CodeGen/RISCV/pipeline-options.ll b/llvm/test/CodeGen/RISCV/pipeline-options.ll new file mode 100644 index 0000000000000..26c9aaba09c94 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/pipeline-options.ll @@ -0,0 +1,35 @@ +; RUN: llc -mtriple=riscv64 -O3 \ +; RUN: -debug-pass=Structure < %s -o /dev/null 2>&1 | \ +; RUN: FileCheck %s --check-prefix=O3-WITHOUT-ENABLE-CFI-INSTR-INSERTER + +; RUN: llc -mtriple=riscv64 -O3 \ +; RUN: --riscv-enable-cfi-instr-inserter=true \ +; RUN: -debug-pass=Structure < %s -o /dev/null 2>&1 | \ +; RUN: FileCheck %s --check-prefix=O3-ENABLE-CFI-INSTR-INSERTER + +; RUN: llc -mtriple=riscv64 -O0 \ +; RUN: -debug-pass=Structure < %s -o /dev/null 2>&1 | \ +; RUN: FileCheck %s --check-prefix=O0-WITHOUT-ENABLE-CFI-INSTR-INSERTER + +; RUN: llc -mtriple=riscv64 -O0 \ +; RUN: --riscv-enable-cfi-instr-inserter=true \ +; RUN: -debug-pass=Structure < %s -o /dev/null 2>&1 | \ +; RUN: FileCheck %s --check-prefix=O0-ENABLE-CFI-INSTR-INSERTER + +; REQUIRES: asserts + +; O3-WITHOUT-ENABLE-CFI-INSTR-INSERTER-LABEL: Pass Arguments: +; NO-O3-WITHOUT-ENABLE-CFI-INSTR-INSERTER: Check CFA info and insert CFI instructions if needed +; O3-WITHOUT-ENABLE-CFI-INSTR-INSERTER: Insert CFI remember/restore state instructions + +; O3-ENABLE-CFI-INSTR-INSERTER-LABEL: Pass Arguments: +; O3-ENABLE-CFI-INSTR-INSERTER: Check CFA info and insert CFI instructions if needed +; NO-O3-ENABLE-CFI-INSTR-INSERTER: Insert CFI remember/restore state instructions + +; O0-WITHOUT-ENABLE-CFI-INSTR-INSERTER-LABEL: Pass Arguments: +; 
NO-O0-WITHOUT-ENABLE-CFI-INSTR-INSERTER: Check CFA info and insert CFI instructions if needed +; O0-WITHOUT-ENABLE-CFI-INSTR-INSERTER: Insert CFI remember/restore state instructions + +; O0-ENABLE-CFI-INSTR-INSERTER-LABEL: Pass Arguments: +; O0-ENABLE-CFI-INSTR-INSERTER: Check CFA info and insert CFI instructions if needed +; NO-O0-ENABLE-CFI-INSTR-INSERTER: Insert CFI remember/restore state instructions diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll new file mode 100644 index 0000000000000..cf15fad5533b9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s + +define i32 @loop_live_out(ptr %p, i64 %n) { +; CHECK-LABEL: loop_live_out: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mv a2, a0 +; CHECK-NEXT: .LBB0_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a3, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a2) +; CHECK-NEXT: sub a1, a1, a3 +; CHECK-NEXT: vadd.vi v8, v8, 1 +; CHECK-NEXT: vse32.v v8, (a2) +; CHECK-NEXT: slli a2, a3, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: bnez a1, .LBB0_1 +; CHECK-NEXT: # %bb.2: # %exit +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a3 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + br label %loop + +loop: + %avl = phi i64 [%n, %entry], [%avl.next, %loop] + %gep = phi ptr [%p, %entry], [%gep.next, %loop] + %vl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true) + %x = call @llvm.vp.load(ptr %gep, splat (i1 true), i32 %vl) + %y = add %x, splat (i32 1) + call void @llvm.vp.store( %y, ptr %gep, splat (i1 true), i32 %vl) + %vl.zext = zext i32 %vl to i64 + %avl.next = sub i64 %avl, %vl.zext + %gep.next = getelementptr i32, ptr %p, i32 %vl + %ec = icmp eq i64 %avl.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + %lastidx = sub i64 %vl.zext, 1 + %lastelt = extractelement %y, i64 %lastidx + ret i32 %lastelt +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir index 4d6d0e122b1cf..55d1c84d5f8d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir @@ -778,3 +778,38 @@ body: | ; CHECK: DBG_VALUE %0:vr DBG_VALUE %0:vr ... +--- +name: vslidedown_vx +tracksRegLiveness: true +body: | + bb.0: + liveins: $x8 + ; CHECK-LABEL: name: vslidedown_vx + ; CHECK: liveins: $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:gprnox0 = COPY $x8 + ; CHECK-NEXT: %y:gprnox0 = ADDI %x, -1 + ; CHECK-NEXT: %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, %x, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:gpr = COPY $x8 + %y:gprnox0 = ADDI %x, -1 + %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */ +... 
+--- +# Make sure we ignore LIs (ADDI $x0, -1) +name: vslidedown_vx_li +tracksRegLiveness: true +body: | + bb.0: + liveins: $x8 + ; CHECK-LABEL: name: vslidedown_vx_li + ; CHECK: liveins: $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %y:gprnox0 = ADDI $x0, -1 + ; CHECK-NEXT: %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */ + %y:gprnox0 = ADDI $x0, -1 + %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */ +... diff --git a/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll index 72bf1fa9a8327..d6384a6913efe 100644 --- a/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll @@ -996,20 +996,31 @@ entry: } define @partial_reduce_select( %a, %b, %m) { -; CHECK-LABEL: partial_reduce_select: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v8 -; CHECK-NEXT: vsext.vf2 v14, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vwmul.vv v8, v12, v14, v0.t -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v11, v8 -; CHECK-NEXT: vadd.vv v9, v9, v10 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: ret +; NODOT-LABEL: partial_reduce_select: +; NODOT: # %bb.0: # %entry +; NODOT-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; NODOT-NEXT: vsext.vf2 v12, v8 +; NODOT-NEXT: vsext.vf2 v14, v9 +; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; NODOT-NEXT: vmv.v.i v8, 0 +; NODOT-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; NODOT-NEXT: vwmul.vv v8, v12, v14, v0.t +; NODOT-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; NODOT-NEXT: vadd.vv v8, v11, v8 +; NODOT-NEXT: vadd.vv v9, v9, v10 +; NODOT-NEXT: vadd.vv v8, v9, v8 +; NODOT-NEXT: ret +; +; DOT-LABEL: partial_reduce_select: +; DOT: # %bb.0: # %entry +; DOT-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; DOT-NEXT: vmv.v.i v10, 0 +; DOT-NEXT: vmerge.vvm v10, v10, v9, v0 +; DOT-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; DOT-NEXT: vmv.v.i v9, 0 +; DOT-NEXT: vqdot.vv v9, v8, v10 +; DOT-NEXT: vmv.v.v v8, v9 +; DOT-NEXT: ret entry: %a.sext = sext %a to %b.sext = sext %b to diff --git a/llvm/test/CodeGen/WebAssembly/simd-extadd.ll b/llvm/test/CodeGen/WebAssembly/simd-extadd.ll new file mode 100644 index 0000000000000..dfc47a6abf03a --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-extadd.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -O2 -mtriple=wasm32 -mattr=+simd128 | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +; Test that adding two extended shuffles from the same vector that ends w/ an add converts to extadd_pairwise + +define <8 x i16> @test_extadd_pairwise_i8x16_s(<16 x i8> %v) { +; CHECK-LABEL: test_extadd_pairwise_i8x16_s: +; CHECK: .functype test_extadd_pairwise_i8x16_s (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.extadd_pairwise_i8x16_s +; CHECK-NEXT: # fallthrough-return + %even = shufflevector <16 x i8> %v, <16 x i8> poison, <8 x i32> + %odd = shufflevector <16 x i8> %v, <16 x i8> poison, <8 x i32> + %even_ext = sext <8 x i8> %even to <8 x i16> + %odd_ext = sext <8 x i8> %odd to <8 x i16> + %result = add <8 x i16> %even_ext, 
%odd_ext + ret <8 x i16> %result +} + +define <8 x i16> @test_extadd_pairwise_i8x16_u(<16 x i8> %v) { +; CHECK-LABEL: test_extadd_pairwise_i8x16_u: +; CHECK: .functype test_extadd_pairwise_i8x16_u (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.extadd_pairwise_i8x16_u +; CHECK-NEXT: # fallthrough-return + %even = shufflevector <16 x i8> %v, <16 x i8> poison, <8 x i32> + %odd = shufflevector <16 x i8> %v, <16 x i8> poison, <8 x i32> + %even_ext = zext <8 x i8> %even to <8 x i16> + %odd_ext = zext <8 x i8> %odd to <8 x i16> + %result = add <8 x i16> %even_ext, %odd_ext + ret <8 x i16> %result +} + +define <4 x i32> @test_extadd_pairwise_i16x8_s(<8 x i16> %v) { +; CHECK-LABEL: test_extadd_pairwise_i16x8_s: +; CHECK: .functype test_extadd_pairwise_i16x8_s (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.extadd_pairwise_i16x8_s +; CHECK-NEXT: # fallthrough-return + %even = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> + %odd = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> + %even_ext = sext <4 x i16> %even to <4 x i32> + %odd_ext = sext <4 x i16> %odd to <4 x i32> + %result = add <4 x i32> %even_ext, %odd_ext + ret <4 x i32> %result +} + +define <4 x i32> @test_extadd_pairwise_i16x8_u(<8 x i16> %v) { +; CHECK-LABEL: test_extadd_pairwise_i16x8_u: +; CHECK: .functype test_extadd_pairwise_i16x8_u (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.extadd_pairwise_i16x8_u +; CHECK-NEXT: # fallthrough-return + %even = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> + %odd = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> + %even_ext = zext <4 x i16> %even to <4 x i32> + %odd_ext = zext <4 x i16> %odd to <4 x i32> + %result = add <4 x i32> %even_ext, %odd_ext + ret <4 x i32> %result +} + +; Negative test: shuffling mask doesn't fit pattern +define <4 x i32> @negative_test_extadd_pairwise_i16x8_u(<8 x i16> %v) { +; CHECK-LABEL: negative_test_extadd_pairwise_i16x8_u: +; CHECK: .functype negative_test_extadd_pairwise_i16x8_u (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 1, 6, 7, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: i32x4.extend_low_i16x8_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: i32x4.extend_low_i16x8_u +; CHECK-NEXT: i32x4.add +; CHECK-NEXT: # fallthrough-return + %even = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> + %odd = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> + %even_ext = zext <4 x i16> %even to <4 x i32> + %odd_ext = zext <4 x i16> %odd to <4 x i32> + %result = add <4 x i32> %even_ext, %odd_ext + ret <4 x i32> %result +} diff --git a/llvm/test/CodeGen/X86/isel-arg-attrs.ll b/llvm/test/CodeGen/X86/isel-arg-attrs.ll new file mode 100644 index 0000000000000..3afee76715d6d --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-arg-attrs.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64 +; 
RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=X64 + +; The src array should be in R10 or ECX register due to nest attribute +define i32 @nest_arg(ptr nest %src) { +; X86-LABEL: nest_arg: +; X86: # %bb.0: +; X86-NEXT: movl 8(%ecx), %eax +; X86-NEXT: retl +; +; X64-LABEL: nest_arg: +; X64: # %bb.0: +; X64-NEXT: movl 8(%r10), %eax +; X64-NEXT: retq + %off = getelementptr [3 x i32], ptr %src, i32 0, i32 2 + %ret = load i32, ptr %off + ret i32 %ret +} diff --git a/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s index cffee7dbbe31e..84860857c3b8f 100644 --- a/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s +++ b/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s @@ -16,3 +16,7 @@ gicr x3, foo gic cdaff // CHECK-ERROR: error: specified gic op requires a register + +gic cdeoi, x3 +// CHECK-ERROR: error: specified gic op does not use a register + diff --git a/llvm/test/MC/AArch64/armv9.7a-gcie.s b/llvm/test/MC/AArch64/armv9.7a-gcie.s index 4fd5d2577e26a..74e95015f6c86 100644 --- a/llvm/test/MC/AArch64/armv9.7a-gcie.s +++ b/llvm/test/MC/AArch64/armv9.7a-gcie.s @@ -828,10 +828,10 @@ GIC CDEN, x3 // CHECK-UNKNOWN: d508c123 sys #0, c12, c1, #1, x3 // CHECK-ERROR: error: GIC cden requires: gcie -GIC CDEOI, x3 -// CHECK-INST: gic cdeoi, x3 -// CHECK-ENCODING: [0xe3,0xc1,0x08,0xd5] -// CHECK-UNKNOWN: d508c1e3 sys #0, c12, c1, #7, x3 +GIC CDEOI +// CHECK-INST: gic cdeoi +// CHECK-ENCODING: [0xff,0xc1,0x08,0xd5] +// CHECK-UNKNOWN: d508c1ff sys #0, c12, c1, #7 // CHECK-ERROR: error: GIC cdeoi requires: gcie GIC CDHM, x3 diff --git a/llvm/test/TableGen/target-mem-intrinsic-attrs.td b/llvm/test/TableGen/target-mem-intrinsic-attrs.td new file mode 100644 index 0000000000000..fc9c3321ad9e9 --- /dev/null +++ b/llvm/test/TableGen/target-mem-intrinsic-attrs.td @@ -0,0 +1,78 @@ +// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include -DTEST_INTRINSICS_SUPPRESS_DEFS %s | FileCheck %s + +include "llvm/IR/Intrinsics.td" + +def int_aarch64_get_target_mem0_mem1 : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrReadMem, IntrRead<[TargetMem0, TargetMem1]>]>; + +def int_aarch64_get_target_mem0_set_target_mem1 : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrRead<[TargetMem0]>, IntrWrite<[TargetMem1]>]>; + +def int_aarch64_get_target_mem1 : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrReadMem, IntrRead<[TargetMem1]>]>; + +def int_aarch64_get_target_mem1_set_target_mem1 : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrRead<[TargetMem1]>, IntrWrite<[TargetMem1]>]>; + +def int_aarch64_set_inaccessible_mem : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrWriteMem, IntrWrite<[InaccessibleMem]>]>; + +def int_aarch64_set_target_mem0 : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrWriteMem, IntrWrite<[TargetMem0]>]>; + +// CHECK: static AttributeSet getIntrinsicFnAttributeSet(LLVMContext &C, unsigned ID) { +// CHECK-NEXT: switch (ID) { +// CHECK-NEXT: default: llvm_unreachable("Invalid attribute set number"); +// CHECK-NEXT: case 0: // llvm.aarch64.get.target.mem0.mem1 +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: 
NoModRef, InaccessibleMem: NoModRef, ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: Ref, TargetMem1: Ref +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(1280)), +// CHECK-NEXT: }); +// CHECK-NEXT: case 1: // llvm.aarch64.get.target.mem0.set.target.mem1 +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: Ref, TargetMem1: Mod +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(2304)), +// CHECK-NEXT: }); +// CHECK-NEXT: case 2: // llvm.aarch64.get.target.mem1 +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: NoModRef, TargetMem1: Ref +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(1024)), +// CHECK-NEXT: }); +// CHECK-NEXT: case 3: // llvm.aarch64.get.target.mem1.set.target.mem1 +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: NoModRef, TargetMem1: ModRef +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(3072)), +// CHECK-NEXT: }); +// CHECK-NEXT: case 4: // llvm.aarch64.set.inaccessible.mem +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: NoModRef, InaccessibleMem: Mod, ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: NoModRef, TargetMem1: NoModRef +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(8)), +// CHECK-NEXT: }); +// CHECK-NEXT: case 5: // llvm.aarch64.set.target.mem0 +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: Mod, TargetMem1: NoModRef +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(512)), diff --git a/llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll b/llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll index df98c5e90b1ae..129093452101d 100644 --- 
a/llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll +++ b/llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll @@ -43,18 +43,11 @@ THINLTO-DAG: {{^}}t.o.2{{$}} RUN: %{command} -; Check that the expected output files have been created. RUN: ls | count 3 -; Check that two native object files has been created RUN: ls | FileCheck %s --check-prefix=THINLTO -; Check that DTLTO cache directory has been created RUN: ls cache-dir/* | count 2 -; Check that 2 cache entries are created RUN: ls cache-dir/llvmcache-* | count 2 - - - ;--- t1.ll target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/FunctionAttrs/argmemonly.ll b/llvm/test/Transforms/FunctionAttrs/argmemonly.ll index 42e0e94c1cee3..4ff36c0dbdc3f 100644 --- a/llvm/test/Transforms/FunctionAttrs/argmemonly.ll +++ b/llvm/test/Transforms/FunctionAttrs/argmemonly.ll @@ -56,7 +56,7 @@ entry: } define i32 @test_read_global() { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i32 @test_read_global ; FNATTRS-SAME: () #[[ATTR2:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -76,7 +76,7 @@ entry: } define i32 @test_read_loaded_ptr(ptr %ptr) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i32 @test_read_loaded_ptr ; FNATTRS-SAME: (ptr readonly captures(none) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -119,7 +119,7 @@ entry: } define void @test_write_global() { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_write_global ; FNATTRS-SAME: () #[[ATTR5:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -243,7 +243,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) @arr = global [32 x i8] zeroinitializer define void @test_memcpy_src_global(ptr %dst) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_memcpy_src_global ; FNATTRS-SAME: (ptr writeonly captures(none) initializes((0, 32)) [[DST:%.*]]) #[[ATTR11:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -263,7 +263,7 @@ entry: } define void @test_memcpy_dst_global(ptr %src) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_memcpy_dst_global ; FNATTRS-SAME: (ptr readonly captures(none) [[SRC:%.*]]) #[[ATTR11]] { ; FNATTRS-NEXT: entry: @@ -388,7 +388,7 @@ define void @test_inaccessibleorargmemonly_readwrite(ptr %arg) { } define void 
@test_recursive_argmem_read(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_recursive_argmem_read ; FNATTRS-SAME: (ptr readonly captures(none) [[P:%.*]]) #[[ATTR16:[0-9]+]] { ; FNATTRS-NEXT: [[PVAL:%.*]] = load ptr, ptr [[P]], align 8 @@ -408,7 +408,7 @@ define void @test_recursive_argmem_read(ptr %p) { } define void @test_recursive_argmem_readwrite(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_recursive_argmem_readwrite ; FNATTRS-SAME: (ptr captures(none) [[P:%.*]]) #[[ATTR17:[0-9]+]] { ; FNATTRS-NEXT: [[PVAL:%.*]] = load ptr, ptr [[P]], align 8 @@ -454,7 +454,7 @@ define void @test_recursive_argmem_read_alloca(ptr %p) { } define void @test_scc_argmem_read_1(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_scc_argmem_read_1 ; FNATTRS-SAME: (ptr readonly captures(none) [[P:%.*]]) #[[ATTR16]] { ; FNATTRS-NEXT: [[PVAL:%.*]] = load ptr, ptr [[P]], align 8 @@ -474,7 +474,7 @@ define void @test_scc_argmem_read_1(ptr %p) { } define void @test_scc_argmem_read_2(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_scc_argmem_read_2 ; FNATTRS-SAME: (ptr readonly captures(none) [[P:%.*]]) #[[ATTR16]] { ; FNATTRS-NEXT: call void @test_scc_argmem_read_1(ptr [[P]]) @@ -518,7 +518,7 @@ entry: ; FIXME: This could be `memory(argmem: read)`. define i64 @select_different_obj(i1 %c, ptr %p, ptr %p2) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i64 @select_different_obj ; FNATTRS-SAME: (i1 [[C:%.*]], ptr readonly captures(none) [[P:%.*]], ptr readonly captures(none) [[P2:%.*]]) #[[ATTR3]] { ; FNATTRS-NEXT: entry: @@ -580,7 +580,7 @@ join: ; FIXME: This could be `memory(argmem: read)`. 
define i64 @phi_different_obj(i1 %c, ptr %p, ptr %p2) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i64 @phi_different_obj ; FNATTRS-SAME: (i1 [[C:%.*]], ptr readonly captures(none) [[P:%.*]], ptr readonly captures(none) [[P2:%.*]]) #[[ATTR3]] { ; FNATTRS-NEXT: entry: diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll index 8113ba65fe422..b5b14f571d47d 100644 --- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll +++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll @@ -20,7 +20,7 @@ define ptr @c1(ptr %q) { ; It would also be acceptable to mark %q as readnone. Update @c3 too. define void @c2(ptr %q) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @c2 ; FNATTRS-SAME: (ptr [[Q:%.*]]) #[[ATTR1:[0-9]+]] { ; FNATTRS-NEXT: store ptr [[Q]], ptr @g, align 8 @@ -37,7 +37,7 @@ define void @c2(ptr %q) { } define void @c3(ptr %q) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @c3 ; FNATTRS-SAME: (ptr [[Q:%.*]]) #[[ATTR2:[0-9]+]] { ; FNATTRS-NEXT: call void @c2(ptr [[Q]]) @@ -127,7 +127,7 @@ l1: @lookup_table = global [2 x i1] [ i1 0, i1 1 ] define i1 @c5(ptr %q, i32 %bitno) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i1 @c5 ; FNATTRS-SAME: (ptr [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR3:[0-9]+]] { ; FNATTRS-NEXT: [[TMP:%.*]] = ptrtoint ptr [[Q]] to i32 @@ -222,7 +222,7 @@ define ptr @lookup_bit(ptr %q, i32 %bitno) readnone nounwind { } define i1 @c7(ptr %q, i32 %bitno) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i1 @c7 ; FNATTRS-SAME: (ptr readonly [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR6:[0-9]+]] { ; FNATTRS-NEXT: [[PTR:%.*]] = call ptr @lookup_bit(ptr [[Q]], i32 [[BITNO]]) @@ -243,7 +243,7 @@ define i1 @c7(ptr %q, i32 %bitno) { define i32 @nc1(ptr %q, ptr %p, i1 %b) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i32 @nc1 ; FNATTRS-SAME: (ptr [[Q:%.*]], ptr captures(none) [[P:%.*]], i1 [[B:%.*]]) #[[ATTR7:[0-9]+]] { ; FNATTRS-NEXT: e: @@ 
-284,7 +284,7 @@ l: } define i32 @nc1_addrspace(ptr %q, ptr addrspace(1) %p, i1 %b) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i32 @nc1_addrspace ; FNATTRS-SAME: (ptr [[Q:%.*]], ptr addrspace(1) captures(none) [[P:%.*]], i1 [[B:%.*]]) #[[ATTR7]] { ; FNATTRS-NEXT: e: @@ -328,7 +328,7 @@ l: } define void @nc2(ptr %p, ptr %q) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @nc2 ; FNATTRS-SAME: (ptr captures(none) [[P:%.*]], ptr [[Q:%.*]]) #[[ATTR7]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = call i32 @nc1(ptr [[Q]], ptr [[P]], i1 false) @@ -468,7 +468,7 @@ define void @self_readonly_nounwind_willreturn(ptr %p) readonly nounwind willret ; It would be acceptable to add readnone to %y1_1 and %y1_2. define void @test1_1(ptr %x1_1, ptr %y1_1, i1 %c) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test1_1 ; FNATTRS-SAME: (ptr readnone captures(none) [[X1_1:%.*]], ptr [[Y1_1:%.*]], i1 [[C:%.*]]) #[[ATTR12:[0-9]+]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = call ptr @test1_2(ptr [[X1_1]], ptr [[Y1_1]], i1 [[C]]) @@ -488,7 +488,7 @@ define void @test1_1(ptr %x1_1, ptr %y1_1, i1 %c) { } define ptr @test1_2(ptr %x1_2, ptr %y1_2, i1 %c) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define ptr @test1_2 ; FNATTRS-SAME: (ptr readnone captures(none) [[X1_2:%.*]], ptr returned [[Y1_2:%.*]], i1 [[C:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] @@ -520,7 +520,7 @@ f: } define void @test2(ptr %x2) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test2 ; FNATTRS-SAME: (ptr readnone captures(none) [[X2:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: call void @test2(ptr [[X2]]) @@ -540,7 +540,7 @@ define void @test2(ptr %x2) { } define void @test3(ptr %x3, ptr %y3, ptr %z3) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test3 ; FNATTRS-SAME: (ptr readnone captures(none) [[X3:%.*]], ptr readnone captures(none) [[Y3:%.*]], ptr readnone captures(none) [[Z3:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: call void @test3(ptr [[Z3]], ptr [[Y3]], ptr [[X3]]) @@ -560,7 +560,7 @@ define void @test3(ptr %x3, ptr %y3, ptr %z3) { } define void @test4_1(ptr %x4_1, i1 %c) { -; FNATTRS: Function Attrs: nofree nosync 
nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test4_1 ; FNATTRS-SAME: (ptr [[X4_1:%.*]], i1 [[C:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = call ptr @test4_2(ptr [[X4_1]], ptr [[X4_1]], ptr [[X4_1]], i1 [[C]]) @@ -580,7 +580,7 @@ define void @test4_1(ptr %x4_1, i1 %c) { } define ptr @test4_2(ptr %x4_2, ptr %y4_2, ptr %z4_2, i1 %c) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define ptr @test4_2 ; FNATTRS-SAME: (ptr readnone captures(none) [[X4_2:%.*]], ptr readnone returned captures(ret: address, provenance) [[Y4_2:%.*]], ptr readnone captures(none) [[Z4_2:%.*]], i1 [[C:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] @@ -744,7 +744,7 @@ entry: @g2 = global ptr null define void @captureLaunder(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: readwrite) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: readwrite, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @captureLaunder ; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR16:[0-9]+]] { ; FNATTRS-NEXT: [[B:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[P]]) @@ -788,7 +788,7 @@ entry: @g3 = global ptr null define void @captureStrip(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @captureStrip ; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR1]] { ; FNATTRS-NEXT: [[B:%.*]] = call ptr @llvm.strip.invariant.group.p0(ptr [[P]]) @@ -1086,7 +1086,7 @@ define i64 @captures_not_ret_only(ptr %p) { ;; Unlike ptrtoint, ptrtoaddr only captures the address define i64 @captures_ptrtoaddr_stored(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define noundef i64 @captures_ptrtoaddr_stored ; FNATTRS-SAME: (ptr captures(address) [[P:%.*]]) #[[ATTR1]] { ; FNATTRS-NEXT: [[INT:%.*]] = ptrtoaddr ptr [[P]] to i64 @@ -1189,7 +1189,7 @@ define ptr @captures_used_ret(ptr %p) { ; Make sure this is does not produce captures(ret: ...). We need to take the ; return capture components into account when handling argument SCCs. 
define ptr @scc_capture_via_ret(i1 %c, ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define ptr @scc_capture_via_ret ; FNATTRS-SAME: (i1 [[C:%.*]], ptr [[P:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] @@ -1291,7 +1291,7 @@ define void @dont_increase_existing_captures_scc2(ptr %p) { } define void @addr_only_scc(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: read, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @addr_only_scc ; FNATTRS-SAME: (ptr readonly captures(address_is_null) [[P:%.*]]) #[[ATTR20:[0-9]+]] { ; FNATTRS-NEXT: [[V:%.*]] = load i8, ptr [[P]], align 1 @@ -1314,7 +1314,7 @@ define void @addr_only_scc(ptr %p) { } define void @addr_only_scc2(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: read, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @addr_only_scc2 ; FNATTRS-SAME: (ptr readonly captures(address_is_null) [[P:%.*]]) #[[ATTR20]] { ; FNATTRS-NEXT: [[CMP:%.*]] = icmp ne ptr [[P]], null diff --git a/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll b/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll index be61990fd6278..1fc0084203fca 100644 --- a/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll +++ b/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll @@ -4,7 +4,7 @@ @i = global i32 0 define void @foo() { -; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) +; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-LABEL: define {{[^@]+}}@foo ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: store i32 1, ptr @i, align 4 @@ -17,7 +17,7 @@ define void @foo() { } define void @bar() { -; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) +; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-LABEL: define {{[^@]+}}@bar ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = load i32, ptr @i, align 4 diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll index 87f64ed3c63bc..8fc72a1ab90b9 100644 --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -33,7 +33,7 @@ define void @test1_2(ptr %x1_2, ptr %y1_2, ptr %z1_2) { ; TODO: Missing with attributor-light: argmem: none, inaccessiblemem: none define ptr @test2(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define {{[^@]+}}@test2 ; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0:[0-9]+]] { 
; FNATTRS-NEXT: store i32 0, ptr @x, align 4 diff --git a/llvm/test/Transforms/FunctionAttrs/writeonly.ll b/llvm/test/Transforms/FunctionAttrs/writeonly.ll index 88c6031613697..05ecb12c710ee 100644 --- a/llvm/test/Transforms/FunctionAttrs/writeonly.ll +++ b/llvm/test/Transforms/FunctionAttrs/writeonly.ll @@ -44,7 +44,7 @@ nouses-argworn-funro_entry: @d-ccc = internal global %_type_of_d-ccc <{ ptr null, i8 1, i8 13, i8 0, i8 -127 }>, align 8 define void @nouses-argworn-funwo(ptr writeonly %.aaa) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define {{[^@]+}}@nouses-argworn-funwo ; FNATTRS-SAME: (ptr readnone captures(none) [[DOTAAA:%.*]]) #[[ATTR2:[0-9]+]] { ; FNATTRS-NEXT: nouses-argworn-funwo_entry: @@ -82,7 +82,7 @@ define void @test_store(ptr %p) { @G = external global ptr define i8 @test_store_capture(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: read, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define {{[^@]+}}@test_store_capture ; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR4:[0-9]+]] { ; FNATTRS-NEXT: store ptr [[P]], ptr @G, align 8 diff --git a/llvm/test/Transforms/InferFunctionAttrs/norecurse_debug.ll b/llvm/test/Transforms/InferFunctionAttrs/norecurse_debug.ll index c8568272d320f..89a09406e5f1d 100644 --- a/llvm/test/Transforms/InferFunctionAttrs/norecurse_debug.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/norecurse_debug.ll @@ -52,5 +52,5 @@ attributes #1 = { nounwind readnone speculatable } !28 = !DILocation(line: 9, column: 18, scope: !2) !29 = !DILocation(line: 10, column: 1, scope: !2) -; CHECK: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: write, inaccessiblemem: none) } +; CHECK: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: write, inaccessiblemem: none, target_mem0: none, target_mem1: none) } ; CHECK-NOT: foo.coefficient1 diff --git a/llvm/test/Transforms/LICM/AArch64/speculative-intrinsic-hoisting.ll b/llvm/test/Transforms/LICM/AArch64/speculative-intrinsic-hoisting.ll new file mode 100644 index 0000000000000..72463b07521eb --- /dev/null +++ b/llvm/test/Transforms/LICM/AArch64/speculative-intrinsic-hoisting.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=licm < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define i64 @sve_uaddv( %inv, i1 %c) { +; CHECK-LABEL: define i64 @sve_uaddv( +; CHECK-SAME: [[INV:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[UADDV:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv4i32( splat (i1 true), [[INV]]) +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[BACKEDGE_COND:%.*]] = icmp ult i64 [[IV]], [[UADDV]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C]], i1 [[BACKEDGE_COND]], i1 false +; CHECK-NEXT: br i1 [[OR_COND]], label 
%[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %cond.true ] + %iv.next = add i64 %iv, 1 + br i1 %c, label %cond.true, label %exit + +cond.true: + %uaddv = call i64 @llvm.aarch64.sve.uaddv.nxv4i32( splat (i1 true), %inv) + %backedge.cond = icmp ult i64 %iv, %uaddv + br i1 %backedge.cond, label %loop, label %exit + +exit: + ret i64 %iv +} + +define i64 @sve_faddv( %inv, i1 %c) { +; CHECK-LABEL: define i64 @sve_faddv( +; CHECK-SAME: [[INV:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[COND_TRUE:.*]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: br i1 [[C]], label %[[COND_TRUE]], label %[[EXIT:.*]] +; CHECK: [[COND_TRUE]]: +; CHECK-NEXT: [[FADDV:%.*]] = call float @llvm.aarch64.sve.faddv.nxv4f32( splat (i1 true), [[INV]]) +; CHECK-NEXT: [[IV_AS_FLOAT:%.*]] = sitofp i64 [[IV]] to float +; CHECK-NEXT: [[BACKEDGE_COND:%.*]] = fcmp olt float [[IV_AS_FLOAT]], [[FADDV]] +; CHECK-NEXT: br i1 [[BACKEDGE_COND]], label %[[LOOP]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[COND_TRUE]] ], [ [[IV]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %cond.true ] + %iv.next = add i64 %iv, 1 + br i1 %c, label %cond.true, label %exit + +cond.true: + %faddv = call float @llvm.aarch64.sve.faddv.nxv4i32( splat (i1 true), %inv) + %iv.as.float = sitofp i64 %iv to float + %backedge.cond = fcmp olt float %iv.as.float, %faddv + br i1 %backedge.cond, label %loop, label %exit + +exit: + ret i64 %iv +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index cb4bd793013b1..9609982b2c68f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -386,7 +386,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; DEFAULT-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { ; DEFAULT-NEXT: [[ENTRY:.*:]] ; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 60 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 28 ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; DEFAULT: [[VECTOR_MEMCHECK]]: ; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[E]], i64 4 @@ -427,16 +427,16 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] -; DEFAULT: [[VECTOR_BODY]]: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE33:.*]] ] -; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]] +; DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META8:![0-9]+]] ; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[B]], align 4, 
!alias.scope [[META11:![0-9]+]] +; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META13:![0-9]+]] ; DEFAULT-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]] -; DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]] ; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP5]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE33:.*]] ] ; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[D]], i64 [[INDEX]] ; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; DEFAULT: [[PRED_STORE_IF]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index 0d8a1021bd438..50807df51c99e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -132,15 +132,15 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT: vector.ph: ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP0:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT]] to <16 x i8> -; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] -; DEFAULT: vector.body: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META6:![0-9]+]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i64> poison, i64 [[TMP1]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT2]], <16 x i64> poison, <16 x i32> zeroinitializer ; DEFAULT-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT3]] to <16 x i8> +; DEFAULT-NEXT: [[TMP0:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT]] to <16 x i8> ; DEFAULT-NEXT: [[TMP3:%.*]] = and <16 x i8> [[TMP2]], [[TMP0]] +; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; DEFAULT: vector.body: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] ; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 16 ; DEFAULT-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1, !alias.scope [[META9:![0-9]+]], !noalias [[META6]] @@ -156,15 +156,15 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i16> poison, i16 [[X]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT4]], <8 x i16> poison, <8 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <8 x i16> [[BROADCAST_SPLAT5]] to <8 x i8> -; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; DEFAULT: vec.epilog.vector.body: -; DEFAULT-NEXT: [[INDEX6:%.*]] = phi i64 [ 
[[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP8:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META6]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP8]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT7]], <8 x i64> poison, <8 x i32> zeroinitializer ; DEFAULT-NEXT: [[TMP9:%.*]] = trunc <8 x i64> [[BROADCAST_SPLAT8]] to <8 x i8> +; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <8 x i16> [[BROADCAST_SPLAT5]] to <8 x i8> ; DEFAULT-NEXT: [[TMP10:%.*]] = and <8 x i8> [[TMP9]], [[TMP7]] +; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; DEFAULT: vec.epilog.vector.body: +; DEFAULT-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX6]] ; DEFAULT-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP11]], align 1, !alias.scope [[META9]], !noalias [[META6]] ; DEFAULT-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll index ed797fcd6c026..dca4f47738309 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll @@ -17,15 +17,15 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) { ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[DST]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 100, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT3]], align 4 [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP5]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 725fa49c0930c..b3c45a565a8fe 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -329,72 +329,85 @@ for.end: define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; 
CHECK-LABEL: @multi_exit( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[UMAX6:%.*]] = call i64 @llvm.umax.i64(i64 [[B:%.*]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX6]], -1 +; CHECK-NEXT: [[UMAX9:%.*]] = call i64 @llvm.umax.i64(i64 [[B:%.*]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX9]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]] -; CHECK-NEXT: [[UMIN7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN7]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 28 +; CHECK-NEXT: [[UMIN10:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN10]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 24 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1) ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[UMAX]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]] ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[A]]) -; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[UMIN]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 1, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[UMIN]], 4294967295 -; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; CHECK-NEXT: br i1 [[TMP10]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 1, [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMIN]], 4294967295 +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[UMIN]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt i64 [[UMIN]], 4294967295 +; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP9]], [[TMP13]] +; CHECK-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 1 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC_1:%.*]], i64 8 ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC_2:%.*]], i64 8 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC_1]], [[SCEVGEP]] +; CHECK-NEXT: [[UMAX3:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1) +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[UMAX3]], -1 +; CHECK-NEXT: [[TMP16:%.*]] = freeze i64 [[TMP15]] +; CHECK-NEXT: [[UMIN4:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP16]], i64 [[A]]) +; CHECK-NEXT: [[TMP17:%.*]] = shl i64 [[UMIN4]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SRC_3:%.*]], i64 [[TMP18]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC_2]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] -; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC_2]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[DST]], 
[[SCEVGEP5]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[SRC_3]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT8]] ; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP12]] -; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[SRC_1]], align 8, !alias.scope [[META6:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META6:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP21]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META9:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP14]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT9]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = and <2 x i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i1> [[TMP17]] to <2 x i8> -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 1 -; CHECK-NEXT: store i8 [[TMP19]], ptr [[DST]], align 1, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[SRC_3]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 2 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP25]], align 8, !alias.scope [[META9:![0-9]+]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq <2 x i64> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = and <2 x i1> [[TMP23]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = zext <2 x i1> [[TMP27]] to <2 x i8> +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x i8> [[TMP28]], i32 1 +; CHECK-NEXT: store i8 [[TMP29]], ptr [[DST]], align 1, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: 
br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL11:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV_1_WIDE:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT_WIDE:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL11]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_LATCH]] ] ; CHECK-NEXT: [[EC_1:%.*]] = icmp ult i64 [[IV_1_WIDE]], [[A]] ; CHECK-NEXT: br i1 [[EC_1]], label [[LOOP_LATCH]], label [[EXIT:%.*]] ; CHECK: loop.latch: +; CHECK-NEXT: [[SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_3]], i32 [[IV_1]] ; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[SRC_1]], align 8 ; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[SRC_2]], align 8 ; CHECK-NEXT: [[CMP55_US:%.*]] = icmp eq i64 [[L_1]], 0 @@ -405,7 +418,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; CHECK-NEXT: [[IV_1_NEXT_WIDE]] = zext i32 [[IV_1_NEXT]] to i64 ; CHECK-NEXT: [[EC_2:%.*]] = icmp ult i64 [[IV_1_NEXT_WIDE]], [[B]] -; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -419,7 +432,8 @@ loop: br i1 %ec.1, label %loop.latch, label %exit loop.latch: - %l.1 = load i64, ptr %src.1, align 8 + %gep.src.1 = getelementptr inbounds i64, ptr %src.1, i32 %iv.1 + %l.1 = load i64, ptr %gep.src.1, align 8 %l.2 = load i64, ptr %src.2, align 8 %cmp55.us = icmp eq i64 %l.1, 0 %cmp.i.us = icmp ne i64 %l.2, 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll index 63f9a1310d15a..dbd7019188d07 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -278,10 +278,10 @@ define void @uniform_copy(ptr %A, ptr %B) { ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META12:![0-9]+]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META12:![0-9]+]] ; CHECK-NEXT: store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META12]] ; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 diff --git a/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll index 8615401af34f8..7bbc186dcbbae 100644 --- a/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll +++ b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll @@ -21,12 +21,12 @@ define void @hoist_invariant_load_noalias_due_to_memchecks(ptr %dst, ptr %invari ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll index bd0fd77e7c391..9bbd67059e84d 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll @@ -102,6 +102,9 @@ define void @ir_tbaa_different(ptr %base, ptr %end, ptr %src) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SRC]], align 4, !alias.scope [[META10:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP4]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -109,9 +112,6 @@ define void @ir_tbaa_different(ptr %base, ptr %end, ptr %src) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SRC]], align 4, !alias.scope [[META10:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x float>, ptr [[NEXT_GEP]], align 4, !alias.scope [[META13:![0-9]+]], !noalias 
[[META10]] ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x float> [[WIDE_VEC]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x float> [[WIDE_VEC]], <4 x float> poison, <2 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll index d21621e46b79c..05cfb1957a766 100644 --- a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll @@ -61,14 +61,14 @@ define void @Test(ptr nocapture %obj, i64 %z) #0 { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[Z]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[Z]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META3:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP13]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META3:![0-9]+]] ; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] @@ -125,14 +125,14 @@ define void @Test(ptr nocapture %obj, i64 %z) #0 { ; CHECK-HOIST: vector.ph: ; CHECK-HOIST-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[Z]], 4 ; CHECK-HOIST-NEXT: [[N_VEC:%.*]] = sub i64 [[Z]], [[N_MOD_VF]] +; CHECK-HOIST-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-HOIST-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; CHECK-HOIST-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-HOIST-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-HOIST: vector.body: ; CHECK-HOIST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-HOIST-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 0, i64 [[INDEX]] -; CHECK-HOIST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META0:![0-9]+]] -; CHECK-HOIST-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope [[META3:![0-9]+]] -; CHECK-HOIST-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; CHECK-HOIST-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x 
i32> zeroinitializer +; CHECK-HOIST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META3:![0-9]+]] ; CHECK-HOIST-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] ; CHECK-HOIST-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[INDEX]] ; CHECK-HOIST-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll index 4d50a814b621d..f49a2d90b0e84 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll @@ -23,17 +23,17 @@ define void @test1_select_invariant(ptr %src.1, ptr %src.2, ptr %dst, i1 %c, i8 ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[PTR_SEL]], align 8, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8 ; CHECK-NEXT: [[INDUCTION2:%.*]] = add i8 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[PTR_SEL]], align 8, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i8 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i8 [[INDUCTION2]] -; CHECK-NEXT: store i8 [[TMP10]], ptr [[TMP11]], align 2, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] -; CHECK-NEXT: store i8 [[TMP10]], ptr [[TMP8]], align 2, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store i8 [[TMP6]], ptr [[TMP11]], align 2, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: store i8 [[TMP6]], ptr [[TMP8]], align 2, !alias.scope [[META3]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/pr50686.ll b/llvm/test/Transforms/LoopVectorize/pr50686.ll index be9110ce0093a..5a56cdfefbb8f 100644 --- a/llvm/test/Transforms/LoopVectorize/pr50686.ll +++ b/llvm/test/Transforms/LoopVectorize/pr50686.ll @@ -15,17 +15,17 @@ define void @m(ptr nocapture %p, ptr nocapture %p2, i32 %q) { ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope [[META0]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0]] +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, 
[[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0]] ; CHECK-NEXT: [[TMP5:%.*]] = sub nsw i32 [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP7]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-align.ll b/llvm/test/Transforms/LoopVectorize/reduction-align.ll index 028eb3b05957d..0c45b96874da2 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-align.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-align.ll @@ -23,13 +23,13 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[HEIGHT]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[HEIGHT]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1]] = add <4 x i16> [[BROADCAST_SPLAT]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll index a1329598529fd..25f40be238338 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll @@ -1511,14 +1511,14 @@ define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) { ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8, !alias.scope [[META69:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi 
i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[C]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8, !alias.scope [[META69:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[TMP1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP0]], align 8, !alias.scope [[META72:![0-9]+]], !noalias [[META69]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8, !alias.scope [[META72:![0-9]+]], !noalias [[META69]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP74:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll index 70adac2103feb..fb25b2bc7b906 100644 --- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll +++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll @@ -80,13 +80,13 @@ define void @single_scalar_cast_stored(ptr %src, ptr %dst, i32 %n) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2, !alias.scope [[META4:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i16 [[TMP0]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = and i16 [[TMP0]], 15 ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP4]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: store i16 [[TMP5]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll index e5e02674704f9..22cf860c8b58c 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll @@ -50,6 +50,7 @@ define void @expand(ptr %src, ptr %dst, i64 %0) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP8]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr [[SRC]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer @@ -58,7 +59,6 @@ define void @expand(ptr %src, ptr %dst, i64 %0) { ; 
CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr [[SRC]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: [[TMP20:%.*]] = shl <4 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0 ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1 diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll b/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll index 2795333effd76..89c32fab54a4c 100644 --- a/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll +++ b/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll @@ -117,7 +117,7 @@ attributes #6 = { noreturn nounwind } ; CHECK-NEXT: ret i32 [[DOT]] ; ; -; CHECK: Function Attrs: minsize mustprogress nofree norecurse nosync nounwind optsize willreturn memory(write, argmem: none, inaccessiblemem: none) +; CHECK: Function Attrs: minsize mustprogress nofree norecurse nosync nounwind optsize willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-LABEL: define dso_local noundef range(i32 0, 2) i32 @_Z10call_catchi ; CHECK-SAME: (i32 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !type [[META4]] !type [[META5]] !type [[META6]] { ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll index a35bcf1c5a88d..c17b15138329c 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll @@ -8,19 +8,69 @@ target triple = "arm64-apple-macosx" define void @hoist_invariant_load(ptr %invariant_ptr, i64 %num_elements, ptr %array) { ; CHECK-LABEL: define void @hoist_invariant_load( ; CHECK-SAME: ptr readonly captures(none) [[INVARIANT_PTR:%.*]], i64 [[NUM_ELEMENTS:%.*]], ptr captures(none) [[ARRAY:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUM_ELEMENTS]], 0 -; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH:.*]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[I2:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH_PREHEADER:.*]] +; CHECK: [[LOOP_LATCH_PREHEADER]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_ELEMENTS]], 11 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[LOOP_LATCH_PREHEADER6:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[NUM_ELEMENTS]], 5 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP1]], i64 -24 +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[INVARIANT_PTR]], i64 8 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ARRAY]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[INVARIANT_PTR]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[LOOP_LATCH_PREHEADER6]], label %[[VECTOR_PH:.*]] 
+; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[NUM_ELEMENTS]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_ELEMENTS]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[INVARIANT_PTR]], align 8, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[I2:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr nusw %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i64 32 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 96 +; CHECK-NEXT: [[TMP12:%.*]] = load <5 x double>, ptr [[GEP]], align 8, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <5 x double> [[TMP12]], <5 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = load <5 x double>, ptr [[TMP9]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <5 x double> [[TMP13]], <5 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP14]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC5]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP17]], i64 1 +; CHECK-NEXT: store double [[TMP15]], ptr [[GEP]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store double [[TMP16]], ptr [[TMP7]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store double [[TMP18]], ptr [[TMP9]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store double [[TMP19]], ptr [[TMP11]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[I2]], 4 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[LOOP_LATCH_PREHEADER6]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[LOOP_LATCH_PREHEADER6]]: +; CHECK-NEXT: [[I2_PH:%.*]] = phi i64 [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_LATCH_PREHEADER]] ], [ [[N_VEC]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: br label %[[LOOP_LATCH:.*]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[I3:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH]] ], [ [[I2_PH]], %[[LOOP_LATCH_PREHEADER6]] ] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr nusw %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I3]] ; CHECK-NEXT: [[INVARIANT_VAL:%.*]] = load double, ptr 
[[INVARIANT_PTR]], align 8 -; CHECK-NEXT: [[ARRAY_VAL:%.*]] = load double, ptr [[GEP]], align 8 +; CHECK-NEXT: [[ARRAY_VAL:%.*]] = load double, ptr [[GEP1]], align 8 ; CHECK-NEXT: [[SUM:%.*]] = fadd double [[INVARIANT_VAL]], [[ARRAY_VAL]] -; CHECK-NEXT: store double [[SUM]], ptr [[GEP]], align 8 -; CHECK-NEXT: [[I_NEXT]] = add nuw i64 [[I2]], 1 +; CHECK-NEXT: store double [[SUM]], ptr [[GEP1]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nuw i64 [[I3]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[NUM_ELEMENTS]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_LATCH]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_LATCH]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll b/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll index 22726e0cac1f1..6a64dc3cddd39 100644 --- a/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll +++ b/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll @@ -14,7 +14,7 @@ ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0 ;. define internal void @ptrarg.1(ptr %arg, i32 %val) argmemonly nounwind { -; CHECK: Function Attrs: nounwind memory(readwrite, inaccessiblemem: none) +; CHECK: Function Attrs: nounwind memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-LABEL: @ptrarg.1( ; CHECK-NEXT: store i32 10, ptr @g, align 4 ; CHECK-NEXT: ret void @@ -62,7 +62,7 @@ define void @caller.2(ptr %ptr) { ; Here the pointer argument %arg will be replaced by a constant. We need to ; drop inaccessiblemem_or_argmemonly. define internal void @ptrarg.3(ptr %arg, i32 %val) inaccessiblemem_or_argmemonly nounwind { -; CHECK: Function Attrs: nounwind memory(readwrite) +; CHECK: Function Attrs: nounwind memory(readwrite, target_mem0: none, target_mem1: none) ; CHECK-LABEL: @ptrarg.3( ; CHECK-NEXT: store i32 10, ptr @g, align 4 ; CHECK-NEXT: ret void @@ -110,7 +110,7 @@ define void @caller.4(ptr %ptr) { ; Here the pointer argument %arg will be replaced by a constant. We need to ; drop inaccessiblemem_or_argmemonly. define internal void @ptrarg.5(ptr %arg, i32 %val) argmemonly inaccessiblemem_or_argmemonly nounwind { -; CHECK: Function Attrs: nounwind memory(readwrite, inaccessiblemem: none) +; CHECK: Function Attrs: nounwind memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-LABEL: @ptrarg.5( ; CHECK-NEXT: store i32 10, ptr @g, align 4 ; CHECK-NEXT: ret void @@ -163,9 +163,9 @@ define i32 @caller.6.cs.attributes(i32 %n) { } ;. -; CHECK: attributes #[[ATTR0]] = { nounwind memory(readwrite, inaccessiblemem: none) } +; CHECK: attributes #[[ATTR0]] = { nounwind memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind memory(argmem: readwrite) } -; CHECK: attributes #[[ATTR2]] = { nounwind memory(readwrite) } +; CHECK: attributes #[[ATTR2]] = { nounwind memory(readwrite, target_mem0: none, target_mem1: none) } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #[[ATTR4]] = { nounwind } ;. 
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll new file mode 100644 index 0000000000000..87537c05573ae --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes="function(slp-vectorizer)" -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 %s -S | FileCheck %s + +define void @test(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0 +; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0 +; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[A0PTR]], align 2, !invariant.load [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[B0PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd reassoc <2 x half> [[TMP0]], [[TMP1]] +; CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[P0]], align 2 +; CHECK-NEXT: ret void +; +entry: + %p0 = getelementptr half, ptr addrspace(1) %base, i32 0 + %p1 = getelementptr half, ptr addrspace(1) %base, i32 1 + ; First pair of invariant loads from otherA. + %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0 + %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0 + %A0 = load half, ptr addrspace(1) %A0PTR, align 2, !invariant.load !0 + %B0 = load half, ptr addrspace(1) %B0PTR, align 2, !invariant.load !0 + %add0 = fadd reassoc half %A0, %B0 + store half %add0, ptr addrspace(1) %p0, align 2 + %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1 + %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1 + %A1 = load half, ptr addrspace(1) %A1PTR, align 2, !invariant.load !0 + %B1 = load half, ptr addrspace(1) %B1PTR, align 2, !invariant.load !0 + %add1 = fadd reassoc half %A1, %B1 + store half %add1, ptr addrspace(1) %p1, align 2 + ret void +} + + +define void @aliastest(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 { +; CHECK-LABEL: define void @aliastest( +; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0 +; CHECK-NEXT: [[P1:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 1 +; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0 +; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0 +; CHECK-NEXT: [[A0:%.*]] = load half, ptr addrspace(1) [[A0PTR]], align 2 +; CHECK-NEXT: [[B0:%.*]] = load half, ptr addrspace(1) [[B0PTR]], align 2 +; CHECK-NEXT: [[ADD0:%.*]] = fadd reassoc half [[A0]], [[B0]] +; CHECK-NEXT: store half [[ADD0]], ptr addrspace(1) [[P0]], align 2 +; CHECK-NEXT: [[A1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 1 +; CHECK-NEXT: [[B1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 1 +; CHECK-NEXT: [[A1:%.*]] = load half, ptr addrspace(1) [[A1PTR]], 
align 2 +; CHECK-NEXT: [[B1:%.*]] = load half, ptr addrspace(1) [[B1PTR]], align 2 +; CHECK-NEXT: [[ADD1:%.*]] = fadd reassoc half [[A1]], [[B1]] +; CHECK-NEXT: store half [[ADD1]], ptr addrspace(1) [[P1]], align 2 +; CHECK-NEXT: ret void +; +entry: + %p0 = getelementptr half, ptr addrspace(1) %base, i32 0 + %p1 = getelementptr half, ptr addrspace(1) %base, i32 1 + ; First pair of invariant loads from otherA. + %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0 + %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0 + %A0 = load half, ptr addrspace(1) %A0PTR, align 2 + %B0 = load half, ptr addrspace(1) %B0PTR, align 2 + %add0 = fadd reassoc half %A0, %B0 + store half %add0, ptr addrspace(1) %p0, align 2 + %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1 + %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1 + %A1 = load half, ptr addrspace(1) %A1PTR, align 2 + %B1 = load half, ptr addrspace(1) %B1PTR, align 2 + %add1 = fadd reassoc half %A1, %B1 + store half %add1, ptr addrspace(1) %p1, align 2 + ret void +} + +define void @voltest(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 { +; CHECK-LABEL: define void @voltest( +; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0 +; CHECK-NEXT: [[P1:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 1 +; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0 +; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0 +; CHECK-NEXT: [[A0:%.*]] = load volatile half, ptr addrspace(1) [[A0PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[B0:%.*]] = load volatile half, ptr addrspace(1) [[B0PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[ADD0:%.*]] = fadd reassoc half [[A0]], [[B0]] +; CHECK-NEXT: store half [[ADD0]], ptr addrspace(1) [[P0]], align 2 +; CHECK-NEXT: [[A1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 1 +; CHECK-NEXT: [[B1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 1 +; CHECK-NEXT: [[A1:%.*]] = load volatile half, ptr addrspace(1) [[A1PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[B1:%.*]] = load volatile half, ptr addrspace(1) [[B1PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[ADD1:%.*]] = fadd reassoc half [[A1]], [[B1]] +; CHECK-NEXT: store half [[ADD1]], ptr addrspace(1) [[P1]], align 2 +; CHECK-NEXT: ret void +; +entry: + %p0 = getelementptr half, ptr addrspace(1) %base, i32 0 + %p1 = getelementptr half, ptr addrspace(1) %base, i32 1 + ; First pair of invariant loads from otherA. 
+ %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0 + %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0 + %A0 = load volatile half, ptr addrspace(1) %A0PTR, align 2, !invariant.load !0 + %B0 = load volatile half, ptr addrspace(1) %B0PTR, align 2, !invariant.load !0 + %add0 = fadd reassoc half %A0, %B0 + store half %add0, ptr addrspace(1) %p0, align 2 + %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1 + %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1 + %A1 = load volatile half, ptr addrspace(1) %A1PTR, align 2, !invariant.load !0 + %B1 = load volatile half, ptr addrspace(1) %B1PTR, align 2, !invariant.load !0 + %add1 = fadd reassoc half %A1, %B1 + store half %add1, ptr addrspace(1) %p1, align 2 + ret void +} + + +attributes #0 = { nounwind } + +!0 = !{} +;. +; CHECK: [[META0]] = !{} +;. diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 217e521b2e43e..cf5200a73e5cc 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -17,9 +17,9 @@ #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX, LLVM_ENABLE_THREADS #include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/COFFPlatform.h" -#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" #include "llvm/ExecutionEngine/Orc/Debugging/DebugInfoSupport.h" #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h" +#include "llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h" #include "llvm/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.h" #include "llvm/ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.h" #include "llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h" @@ -1298,8 +1298,8 @@ Session::Session(std::unique_ptr EPC, Error &Err) ObjLayer.addPlugin(ExitOnErr(EHFrameRegistrationPlugin::Create(ES))); if (DebuggerSupport) { Error TargetSymErr = Error::success(); - auto Plugin = std::make_unique(ES, true, true, - TargetSymErr); + auto Plugin = + std::make_unique(ES, true, true, TargetSymErr); if (!TargetSymErr) ObjLayer.addPlugin(std::move(Plugin)); else diff --git a/llvm/unittests/Support/ModRefTest.cpp b/llvm/unittests/Support/ModRefTest.cpp index 9c13908da44bb..128501bf2d957 100644 --- a/llvm/unittests/Support/ModRefTest.cpp +++ b/llvm/unittests/Support/ModRefTest.cpp @@ -21,7 +21,7 @@ TEST(ModRefTest, PrintMemoryEffects) { raw_string_ostream OS(S); OS << MemoryEffects::none(); EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: " - "NoModRef, Other: NoModRef"); + "NoModRef, Other: NoModRef, TargetMem0: NoModRef, TargetMem1: NoModRef"); } } // namespace diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 0e76c64f09f59..3842ba235ead3 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -996,7 +996,7 @@ TEST_F(VPRecipeTest, CastVPInstructionToVPUser) { VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPInstruction Recipe(Instruction::Add, {Op1, Op2}); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); } TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { @@ -1011,7 +1011,7 @@ TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { Args.push_back(Op2); VPWidenRecipe WidenR(*AI, Args, VPIRMetadata(), DebugLoc()); - checkVPRecipeCastImpl(&WidenR); + checkVPRecipeCastImpl(&WidenR); delete AI; } @@ -1030,7 +1030,7 @@ TEST_F(VPRecipeTest, 
CastVPWidenCallRecipeToVPUserAndVPDef) { Args.push_back(CalledFn); VPWidenCallRecipe Recipe(Call, Fn, Args); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); VPValue *VPV = &Recipe; EXPECT_TRUE(VPV->getDefiningRecipe()); @@ -1056,7 +1056,8 @@ TEST_F(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) { VPWidenSelectRecipe WidenSelectR(*SelectI, make_range(Args.begin(), Args.end())); - checkVPRecipeCastImpl(&WidenSelectR); + checkVPRecipeCastImpl( + &WidenSelectR); VPValue *VPV = &WidenSelectR; EXPECT_EQ(&WidenSelectR, VPV->getDefiningRecipe()); @@ -1094,7 +1095,7 @@ TEST_F(VPRecipeTest, CastVPWidenCastRecipeToVPUser) { VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, *Cast, {}); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); delete Cast; } @@ -1105,7 +1106,7 @@ TEST_F(VPRecipeTest, CastVPWidenIntrinsicRecipeToVPUser) { VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPWidenIntrinsicRecipe Recipe(Intrinsic::smax, {Op1, Op2}, Int32); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); } TEST_F(VPRecipeTest, CastVPBlendRecipeToVPUser) { @@ -1135,7 +1136,7 @@ TEST_F(VPRecipeTest, CastVPInterleaveRecipeToVPUser) { InterleaveGroup IG(4, false, Align(4)); VPInterleaveRecipe Recipe(&IG, Addr, {}, Mask, false, {}, DebugLoc()); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); } TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) { @@ -1151,7 +1152,7 @@ TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) { auto *Call = CallInst::Create(FTy, PoisonValue::get(FTy)); VPReplicateRecipe Recipe(Call, make_range(Args.begin(), Args.end()), true); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); delete Call; } @@ -1175,7 +1176,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {}); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); VPValue *VPV = Recipe.getVPSingleValue(); EXPECT_TRUE(isa(VPV->getDefiningRecipe())); @@ -1194,7 +1195,7 @@ TEST_F(VPRecipeTest, CastVPInterleaveEVLRecipeToVPUser) { VPInterleaveRecipe BaseRecipe(&IG, Addr, {}, Mask, false, {}, DebugLoc()); VPInterleaveEVLRecipe Recipe(BaseRecipe, *EVL, Mask); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); } TEST_F(VPRecipeTest, CastVPWidenLoadEVLRecipeToVPUser) { @@ -1209,7 +1210,7 @@ TEST_F(VPRecipeTest, CastVPWidenLoadEVLRecipeToVPUser) { VPWidenLoadRecipe BaseLoad(*Load, Addr, Mask, true, false, {}, {}); VPWidenLoadEVLRecipe Recipe(BaseLoad, Addr, *EVL, Mask); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); delete Load; } @@ -1225,7 +1226,7 @@ TEST_F(VPRecipeTest, CastVPWidenStoreRecipeToVPUser) { VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPWidenStoreRecipe Recipe(*Store, Addr, StoredVal, Mask, true, false, {}, {}); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); delete Store; } @@ -1244,7 +1245,7 @@ TEST_F(VPRecipeTest, CastVPWidenStoreEVLRecipeToVPUser) { {}); VPWidenStoreEVLRecipe Recipe(BaseStore, Addr, *EVL, Mask); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); delete Store; } diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index 228969ab37f85..d90fcc25502e2 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ 
b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -13,6 +13,7 @@ #include "CodeGenIntrinsics.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TableGen/Error.h" @@ -377,7 +378,19 @@ void CodeGenIntrinsic::setProperty(const Record *R) { ME &= MemoryEffects::argMemOnly(); else if (R->getName() == "IntrInaccessibleMemOnly") ME &= MemoryEffects::inaccessibleMemOnly(); - else if (R->getName() == "IntrInaccessibleMemOrArgMemOnly") + else if (R->isSubClassOf("IntrRead")) { + MemoryEffects ReadMask = MemoryEffects::writeOnly(); + for (const Record *RLoc : R->getValueAsListOfDefs("MemLoc")) + ReadMask = ReadMask.getWithModRef(getValueAsIRMemLocation(RLoc), + ModRefInfo::ModRef); + ME &= ReadMask; + } else if (R->isSubClassOf("IntrWrite")) { + MemoryEffects WriteMask = MemoryEffects::readOnly(); + for (const Record *WLoc : R->getValueAsListOfDefs("MemLoc")) + WriteMask = WriteMask.getWithModRef(getValueAsIRMemLocation(WLoc), + ModRefInfo::ModRef); + ME &= WriteMask; + } else if (R->getName() == "IntrInaccessibleMemOrArgMemOnly") ME &= MemoryEffects::inaccessibleOrArgMemOnly(); else if (R->getName() == "Commutative") isCommutative = true; @@ -477,6 +490,22 @@ void CodeGenIntrinsic::setProperty(const Record *R) { } } +llvm::IRMemLocation +CodeGenIntrinsic::getValueAsIRMemLocation(const Record *R) const { + StringRef Name = R->getName(); + IRMemLocation Loc = + StringSwitch(Name) + .Case("TargetMem0", IRMemLocation::TargetMem0) + .Case("TargetMem1", IRMemLocation::TargetMem1) + .Case("InaccessibleMem", IRMemLocation::InaccessibleMem) + .Default(IRMemLocation::Other); // fallback enum + + if (Loc == IRMemLocation::Other) + PrintFatalError(R->getLoc(), "unknown IRMemLocation: " + Name); + + return Loc; +} + bool CodeGenIntrinsic::isParamAPointer(unsigned ParamIdx) const { if (ParamIdx >= IS.ParamTys.size()) return false; diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 6ac6f734326d8..305260a7ef4a9 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -186,6 +186,8 @@ struct CodeGenIntrinsic { bool isParamImmArg(unsigned ParamIdx) const; + llvm::IRMemLocation getValueAsIRMemLocation(const Record *R) const; + CodeGenIntrinsic(const Record *R, const CodeGenIntrinsicContext &Ctx); }; diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 3ac23185ef91c..9fed5920a019f 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -599,10 +599,10 @@ static AttributeSet getIntrinsicFnAttributeSet(LLVMContext &C, unsigned ID) { if (!UniqFnAttributes.try_emplace(&Int, ID).second) continue; OS << formatv(R"( - case {}: + case {}: // {} return AttributeSet::get(C, {{ )", - ID); + ID, Int.Name); auto addAttribute = [&OS](StringRef Attr) { OS << formatv(" Attribute::get(C, Attribute::{}),\n", Attr); }; diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn index 0034cd9993b88..ab3b717eed69d 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn @@ -22,7 +22,6 @@ static_library("Orc") { "CompileOnDemandLayer.cpp", "CompileUtils.cpp", "Core.cpp", - "DebugObjectManagerPlugin.cpp", 
"DebugUtils.cpp", "EHFrameRegistrationPlugin.cpp", "ELFNixPlatform.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Debugging/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Debugging/BUILD.gn index 5610679ff333e..a054e45b22ce5 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Debugging/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Debugging/BUILD.gn @@ -11,6 +11,7 @@ static_library("Debugging") { "DebugInfoSupport.cpp", "DebuggerSupport.cpp", "DebuggerSupportPlugin.cpp", + "ELFDebugObjectPlugin.cpp", "LLJITUtilsCBindings.cpp", "PerfSupportPlugin.cpp", "VTuneSupportPlugin.cpp", diff --git a/mlir/docs/Dialects/NVVM/_index.md b/mlir/docs/Dialects/NVVM/_index.md new file mode 100644 index 0000000000000..f4832f76f86ad --- /dev/null +++ b/mlir/docs/Dialects/NVVM/_index.md @@ -0,0 +1,84 @@ +# NVVM Dialect + +The NVVM dialect is MLIR's LLVM-IR-based, NVIDIA-specific backend dialect. It +models NVVM intrinsics and public ISA functionality and introduces NVIDIA +extensions to the MLIR/LLVM type system and address spaces (e.g., global, +shared, and cluster memory), enabling faithful lowering of GPU kernels to the +NVPTX toolchain. While a NVVM op usually maps to a single LLVM IR intrinsic, +the NVVM dialect uses type polymorphism and other attributes so that a single +NVVM op can map to different LLVM intrinsics. + +## Scope and Capabilities + +The dialect covers core GPU features such as thread/block builtins, barriers +and atomics, warp-level collectives (e.g., shuffle/vote), matrix/tensor core +operations (e.g., `mma.sync`, `wgmma`), tensor memory accelerator (TMA) +operations, asynchronous copies (`cp.async`, bulk/tensor variants) with memory +barriers, cache and prefetch controls, and NVVM-specific attributes and enums +(e.g., FP rounding modes, memory scopes, and MMA types/layouts). + +## Placement in the Lowering Pipeline + +NVVM sits below target-agnostic dialects like `gpu` and NVIDIA's `nvgpu`. +Typical pipelines convert `gpu`/`nvgpu` ops into NVVM using +`-convert-gpu-to-nvvm` and `-convert-nvgpu-to-nvvm`, then translate into LLVM +for final code generation via NVPTX backend. + +## Target Configuration and Serialization + +NVVM provides a `#nvvm.target` attribute to describe the GPU target (SM, +features, and flags). In conjunction with `gpu` serialization (e.g., +`gpu-module-to-binary`), this enables producing architecture-specific GPU +binaries (such as CUBIN) from nested GPU modules. + +## Inline PTX + +When an intrinsic is unavailable or a performance-critical sequence must be +expressed directly, NVVM provides an `nvvm.inline_ptx` op to embed PTX inline +as a last-resort escape hatch, with explicit operands and results. + +## Memory Spaces + +The NVVM dialect introduces the following memory spaces, each with distinct +scopes and lifetimes: + +| Memory Space | Address Space | Scope | +|-------------------|---------------|----------------------| +| `generic` | 0 | All threads | +| `global` | 1 | All threads (device) | +| `shared` | 3 | Thread block (CTA) | +| `constant` | 4 | All threads | +| `local` | 5 | Single thread | +| `tensor` | 6 | Thread block (CTA) | +| `shared_cluster` | 7 | Thread block cluster | + +### Memory Space Details + +- **generic**: Can point to any memory space; requires runtime resolution of + actual address space. Use when pointer origin is unknown at compile time. + Performance varies based on the underlying memory space. 
+- **global**: Accessible by all threads across all blocks; persists across + kernel launches. Highest latency but largest capacity (device memory). Best + for large data and inter-kernel communication. +- **shared**: Shared within a thread block (CTA); very fast on-chip memory for + cooperation between threads in the same block. Limited capacity. Ideal for + block-level collaboration, caching, and reducing global memory traffic. +- **constant**: Read-only memory cached per SM. Size typically limited to 64KB. + Best for read-only data and uniform values accessed by all threads. +- **local**: Private to each thread. Use for per-thread private data and + automatic variables that don't fit in registers. +- **tensor**: Special memory space for tensor core operations. Used by + `tcgen05` instructions on SM 100+ for tensor input/output operations. +- **shared_cluster**: Distributed shared memory across thread blocks within a + cluster (SM 90+). Enables collaboration beyond single-block scope with fast + access across cluster threads. + + +## Non-Goals + +NVVM is not a place for convenience or "wrapper" ops. It is not intended to +introduce high-level ops that expand into multiple unrelated NVVM intrinsics or +that lower to no intrinsic at all. Such abstractions belong in higher-level +dialects (e.g., `nvgpu`, `gpu`, or project-specific dialects). The design +intent is a thin, predictable, low-level surface with near-mechanical lowering +to NVVM/LLVM IR. \ No newline at end of file diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index d5665b439b059..d2e598e5a0bf6 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -628,6 +628,8 @@ def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> { /*default=*/"false", "Replace memref arguments in GPU functions with bare pointers. " "All memrefs must have static shape.">, + Option<"allowPatternRollback", "allow-pattern-rollback", "bool", "true", + "Experimental performance flag to disallow pattern rollback">, ListOption<"allowedDialects", "allowed-dialects", "std::string", "Run conversion patterns of only the specified dialects">, ]; @@ -1084,6 +1086,10 @@ def SCFToControlFlowPass : Pass<"convert-scf-to-cf"> { let summary = "Convert SCF dialect to ControlFlow dialect, replacing structured" " control flow with a CFG"; let dependentDialects = ["cf::ControlFlowDialect"]; + let options = [ + Option<"allowPatternRollback", "allow-pattern-rollback", "bool", "true", + "Experimental performance flag to disallow pattern rollback"> + ]; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h index fccb49d49da70..34c85de3418ec 100644 --- a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h @@ -58,6 +58,10 @@ struct GPUToNVVMPipelineOptions "Whether to use the bareptr calling convention on the host (warning " "this should be false until the GPU layering is fixed)"), llvm::cl::init(false)}; + PassOptions::Option allowPatternRollback{ + *this, "allow-pattern-rollback", + llvm::cl::desc("Allow pattern rollback during dialect conversion"), + llvm::cl::init(true)}; }; // Options for the gpu to xevm pipeline. 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 6e3a92b5bde42..524b9f820f290 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -37,84 +37,6 @@ def LLVM_PointerSharedCluster : LLVM_PointerInAddressSpace<7>; //===----------------------------------------------------------------------===// def NVVM_Dialect : Dialect { - let summary = "The NVVM dialect that models NVIDIA's public ISA"; - - let description = [{ - The NVVM dialect is MLIR's LLVM-IR-based, NVIDIA-specific backend dialect. It - models NVVM intrinsics and public ISA functionality and introduces NVIDIA - extensions to the MLIR/LLVM type system and address spaces (e.g., global, - shared, and cluster memory), enabling faithful lowering of GPU kernels to the - NVPTX toolchain. While a NVVM op usually maps to a single LLVM IR intrinsic, - the NVVM dialect uses type polymorphism and other attributes so that a single - NVVM op can map to different LLVM intrinsics. - - **Scope and capabilities:** The dialect covers core GPU features such as - thread/block builtins, barriers and atomics, warp-level collectives (e.g., - shuffle/vote), matrix/tensor core operations (e.g., `mma.sync`, `wgmma`), - tensor memory accelerator (TMA) operations, asynchronous copies (`cp.async`, - bulk/tensor variants) with memory barriers, cache and prefetch controls, and - NVVM-specific attributes and enums (e.g., FP rounding modes, memory scopes, - and MMA types/layouts). - - **Non-goals:** NVVM is not a place for convenience or “wrapper” ops. It is - not intended to introduce high-level ops that expand into multiple unrelated - NVVM intrinsics or that lower to no intrinsic at all. Such abstractions belong - in higher-level dialects (e.g., `nvgpu`, `gpu`, or project-specific dialects). - The design intent is a thin, predictable, low-level surface with - near-mechanical lowering to NVVM/LLVM IR. - - **Placement in the lowering pipeline:** NVVM sits below target-agnostic - dialects like `gpu` and NVIDIA's `nvgpu`. Typical pipelines convert - `gpu`/`nvgpu` ops into NVVM using `-convert-gpu-to-nvvm` and - `-convert-nvgpu-to-nvvm`, then translate into LLVM for final code - generation via NVPTX backend. - - **Target configuration and serialization:** NVVM provides a `#nvvm.target` - attribute to describe the GPU target (SM, features, and flags). In - conjunction with `gpu` serialization (e.g., `gpu-module-to-binary`), this - enables producing architecture-specific GPU binaries (such as CUBIN) from - nested GPU modules. - - **Inline PTX:** When an intrinsic is unavailable or a performance-critical - sequence must be expressed directly, NVVM provides an `nvvm.inline_ptx` op to - embed PTX inline as a last-resort escape hatch, with explicit operands and - results. 
- - - **Memory Spaces:** The NVVM dialect introduces the following memory spaces, - each with distinct scopes and lifetimes: -``` - | Memory Space | Address Space | Scope | Lifetime | - |-------------------|---------------|----------------------|-------------------| - | `generic` | 0 | All threads | Context-dependent | - | `global` | 1 | All threads (device) | Application | - | `shared` | 3 | Thread block (CTA) | Kernel execution | - | `constant` | 4 | All threads (RO) | Application | - | `local` | 5 | Single thread | Kernel execution | - | `tensor` | 6 | Thread block (CTA) | Kernel execution | - | `shared_cluster` | 7 | Thread block cluster | Kernel execution | -``` - **Memory Space Details:** - - **generic**: Can point to any memory space; requires runtime resolution of - actual address space. Use when pointer origin is unknown at compile time. - Performance varies based on the underlying memory space. - - **global**: Accessible by all threads across all blocks; persists across - kernel launches. Highest latency but largest capacity (device memory). Best - for large data and inter-kernel communication. - - **shared**: Shared within a thread block (CTA); very fast on-chip memory for - cooperation between threads in the same block. Limited capacity. Ideal for - block-level collaboration, caching, and reducing global memory traffic. - - **constant**: Read-only memory cached per SM. Size typically limited to - 64KB. Best for read-only data and uniform values accessed by all threads. - - **local**: Private to each thread. Use for per-thread private data and - automatic variables that don't fit in registers. - - **tensor**: Special memory space for tensor core operations. Used by - `tcgen05` instructions on SM 100+ for tensor input/output operations. - - **shared_cluster**: Distributed shared memory across thread blocks within - a cluster (SM 90+). Enables collaboration beyond single-block scope with - fast access across cluster threads. - }]; - let name = "nvvm"; let cppNamespace = "::mlir::NVVM"; let dependentDialects = ["LLVM::LLVMDialect"]; @@ -4676,6 +4598,551 @@ def NVVM_ClusterLaunchControlQueryCancelOp }]; } +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma Ops +//===----------------------------------------------------------------------===// + +def Tcgen05MMAKindF16 : I32EnumAttrCase<"F16", 0, "f16">; +def Tcgen05MMAKindTF32 : I32EnumAttrCase<"TF32", 1, "tf32">; +def Tcgen05MMAKindF8F6F4 : I32EnumAttrCase<"F8F6F4", 2, "f8f6f4">; +def Tcgen05MMAKindINT8 : I32EnumAttrCase<"I8", 3, "i8">; + +def Tcgen05MMAKind : I32EnumAttr< + "Tcgen05MMAKind", + "tcgen05 MMA Supported Types", + [Tcgen05MMAKindF8F6F4, Tcgen05MMAKindINT8, Tcgen05MMAKindF16, + Tcgen05MMAKindTF32]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMAKindAttr : EnumAttr { + let description = [{ + The Tcgen05MMAKind attribute describes the allowed set of types for matrix A and B in the tcgen05.mma.{sp} Op. 
The following are supported types for each kind: + + ``` + +-------------+--------------------------------------------+ + | Matrix Kind | supported types for A / B | + +-------------+--------------------------------------------+ + | f16 | f16, bf16 | + | tf32 | tf32 | + | f8f6f4 | e4m3, e5m2, e2m3, e3m2, e2m1 | + | i8 | unsigned 8b, signed 8b | + +-------------+--------------------------------------------+ + ``` + }]; + let assemblyFormat = "`<` $value `>`"; +} + +def Tcgen05MMACollectorOpDiscard : I32EnumAttrCase<"DISCARD", 0, "discard">; +def Tcgen05MMACollectorOpLastUse : I32EnumAttrCase<"LASTUSE", 1, "lastuse">; +def Tcgen05MMACollectorOpFill : I32EnumAttrCase<"FILL", 2, "fill">; +def Tcgen05MMACollectorOpUse : I32EnumAttrCase<"USE", 3, "use">; + +def Tcgen05MMACollectorOp : I32EnumAttr< + "Tcgen05MMACollectorOp", + "tcgen05.mma Collector Buffer Operation", + [Tcgen05MMACollectorOpDiscard, + Tcgen05MMACollectorOpLastUse, + Tcgen05MMACollectorOpFill, + Tcgen05MMACollectorOpUse]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMACollectorOpAttr : EnumAttr { + let description = [{ + Tcgen05MMACollectorOp attribute specifies the collector buffer operations. + The following are the supported operations: + * discard : Release buffer after use (default) + * lastuse : Mark buffer for last use + * fill : Fill buffer + * use : Use buffer without modification + }]; + let assemblyFormat = "`<` $value `>`"; +} + +def NVVM_Tcgen05MMAOp : NVVM_Op<"tcgen05.mma", + [AttrSizedOperandSegments, + NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs MMA operation on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma` operation is an asynchronous tensor core instruction that + performs matrix multiplication, accumulation in a single fused operation. It + targets 5th-generation tensor cores, providing developers with fine-grained + control over execution and scheduling. + + ``` + D = A * B + (D * 2^ -scaleInputD) // if `scaleInputD` is provided + D = A * B // if `enableInputD` is false + D = A * B + D // otherwise + ``` + + where: + - A is an `M x K` matrix in tensor memory or described using shared memory descriptor + - B is a `K x N` matrix described using shared memory descriptor + - D is an `M x N` accumulator matrix in tensor memory + + The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op + + - idesc is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor) + + Optional Operands: + - `scaleInputD` is an Immediate value operand used for scaling D matrix by 2 ^ (-scaleInputD). 
The valid range is [0, 15] + + - `disableOutputLane` is a vector mask for selective output + * vector<4 x i32> when ctaGroup is CTA_1 + * vector<8 x i32> when ctaGroup is CTA_2 + + Required Attributes: + - `kind` is a Tcgen05MMAKind attribute + + - `ctaGroup` specifies CTA group configuration + * cta_1: MMA will be performed on the current thread's CTA + * cta_2: MMA will be performed on the current thread and it's peer CTA + + Default Attributes: + - collectorOp is a Tcgen05MMACollectorOp attribute with matrix A as the collector buffer + + - `aShift` shifts the rows of the A matrix down by one row and can only be + applied if A is in tensor memory + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma) + }]; + + let arguments = (ins + Tcgen05MMAKindAttr:$kind, + CTAGroupKindAttr:$ctaGroup, + DefaultValuedAttr:$collectorOp, + UnitAttr:$aShift, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + Optional:$scaleInputD, + Optional>:$disableOutputLane + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD (`scale` `=` $scaleInputD^)? + (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)` + }]; + + let hasVerifier = true; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = NVVM::Tcgen05MMAOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + +def NVVM_Tcgen05MMASparseOp : NVVM_Op<"tcgen05.mma.sp", + [AttrSizedOperandSegments, + NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs MMA operation with sparse A matrix on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma.sp` operation is an asynchronous tensor core instruction + that performs matrix multiplication, accumulation with sparse `A` matrix in + a single fused operation. It targets 5th-generation tensor cores, providing + developers with fine-grained control over execution and scheduling. + + ``` + D = A * B + (D * 2^ -scaleInputD) // if `scaleInputD` is provided + D = A * B // if `enableInputD` is false + D = A * B + D // otherwise + ``` + + where: + - A is an `M x (K / 2)` matrix in tensor memory or described using shared memory descriptor + - B is a `K x N` matrix described using shared memory descriptor + - D is an `M x N` accumulator matrix in tensor memory + - sparseMetadata located in tensor memory specifies the mapping of the `K / 2` + non-zero elements to the K elements before performing the MMA operation + + Other attributes and operands are similar to that of tcgen05.mma Op + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp) + }]; + + let arguments = (ins + Tcgen05MMAKindAttr:$kind, + CTAGroupKindAttr:$ctaGroup, + DefaultValuedAttr:$collectorOp, + UnitAttr:$aShift, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + LLVM_PointerTensor:$sparseMetadata, + Optional:$scaleInputD, + Optional>:$disableOutputLane + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata (`scale` `=` $scaleInputD^)? (`mask` `=` $disableOutputLane^)? 
attr-dict `:` `(` type(operands) `)`
+ }];
+
+ let hasVerifier = true;
+
+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase &builder);
+ }];
+
+ let llvmBuilder = [{
+ auto [ID, args] = NVVM::Tcgen05MMASparseOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ createIntrinsicCall(builder, ID, args);
+ }];
+}
+
+def Tcgen05MMAKindMXF8F6F4 : I32EnumAttrCase<"MXF8F6F4", 0, "mxf8f6f4">;
+def Tcgen05MMAKindMXF4 : I32EnumAttrCase<"MXF4", 1, "mxf4">;
+def Tcgen05MMAKindMXF4NVF4 : I32EnumAttrCase<"MXF4NVF4", 2, "mxf4nvf4">;
+
+def Tcgen05MMABlockScaleKind : I32EnumAttr<
+ "Tcgen05MMABlockScaleKind",
+ "tcgen05.mma.block_scale supported types",
+ [Tcgen05MMAKindMXF8F6F4, Tcgen05MMAKindMXF4, Tcgen05MMAKindMXF4NVF4]> {
+ let cppNamespace = "::mlir::NVVM";
+ let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMABlockScaleKindAttr : EnumAttr {
+ let description = [{
+ The Tcgen05MMABlockScaleKind attribute describes the allowed set of types for matrix A and B in the tcgen05.mma.{sp}.block_scale Op. The following are supported types for each kind:
+
+ ```
+ +--------------+-------------------------------------------+
+ | Matrix Kind  | supported types for A / B                 |
+ +--------------+-------------------------------------------+
+ | mxf8f6f4     | e4m3, e5m2, e2m3, e3m2, e2m1              |
+ | mxf4         | e2m1                                      |
+ | mxf4nvf4     | e2m1                                      |
+ +--------------+-------------------------------------------+
+ ```
+ }];
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def Tcgen05MMABlockScaleDefault : I32EnumAttrCase<"DEFAULT", 0, "default">;
+def Tcgen05MMABlockScaleBlock16 : I32EnumAttrCase<"BLOCK16", 1, "block16">;
+def Tcgen05MMABlockScaleBlock32 : I32EnumAttrCase<"BLOCK32", 2, "block32">;
+
+def Tcgen05MMABlockScale
+ : I32EnumAttr<"Tcgen05MMABlockScale",
+ "tcgen05.mma block scale attribute",
+ [Tcgen05MMABlockScaleDefault, Tcgen05MMABlockScaleBlock16,
+ Tcgen05MMABlockScaleBlock32]> {
+ let cppNamespace = "::mlir::NVVM";
+ let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMABlockScaleAttr : EnumAttr {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_Tcgen05MMABlockScaleOp : NVVM_Op<"tcgen05.mma.block_scale",
+ [NVVMRequiresSMa<[100, 110]>]> {
+ let summary = "Performs block scaled MMA operation on 5th-gen tensor cores";
+
+ let description = [{
+ The `tcgen05.mma.block_scale` operation is an asynchronous tensor core instruction
+ that performs matrix multiplication, accumulation with block scaling in a
+ single fused operation. It targets 5th-generation tensor cores, providing
+ developers with fine-grained control over execution and scheduling.
+
+ ```
+ D = (A * scale_a) * (B * scale_b)     // if `enableInputD` is false
+ D = (A * scale_a) * (B * scale_b) + D // otherwise
+ ```
+
+ where:
+ - A is an M x (K / 2) matrix in tensor memory or described using shared memory descriptor
+ - B is a K x N matrix described using shared memory descriptor
+ - D is an M x N accumulator matrix in tensor memory
+ - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively
+
+ The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op
+
+ - `idesc` is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
+
+ Required Attributes:
+ - `kind` is a Tcgen05MMABlockScaleKind attribute
+
+ - `ctaGroup` specifies CTA group configuration
+ * cta_1: MMA will be performed on the current thread's CTA
+ * cta_2: MMA will be performed on the current thread and its peer CTA
+
+ Default Attributes:
+ - collectorOp is a Tcgen05MMACollectorOp attribute with matrix A as the collector buffer
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma)
+ }];
+
+ let arguments = (ins
+ Tcgen05MMABlockScaleKindAttr:$kind,
+ CTAGroupKindAttr:$ctaGroup,
+ DefaultValuedAttr:$blockScale,
+ DefaultValuedAttr:$collectorOp,
+ LLVM_PointerTensor:$matrixD,
+ AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
+ I64:$matrixB,
+ I32:$idesc, I1:$enableInputD,
+ LLVM_PointerTensor:$scaleA,
+ LLVM_PointerTensor:$scaleB
+ );
+
+ let assemblyFormat = [{
+ $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $scaleA `,` $scaleB
+ attr-dict `:` `(` type(operands) `)`
+ }];
+
+ let hasVerifier = true;
+
+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase &builder);
+ }];
+
+ let llvmBuilder = [{
+ auto [ID, args] = NVVM::Tcgen05MMABlockScaleOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ createIntrinsicCall(builder, ID, args);
+ }];
+}
+
+def NVVM_Tcgen05MMASparseBlockScaleOp : NVVM_Op<"tcgen05.mma.sp.block_scale",
+ [NVVMRequiresSMa<[100, 110]>]> {
+ let summary = "Performs block scaled MMA operation with sparse A matrix on 5th-gen tensor cores";
+
+ let description = [{
+ The `tcgen05.mma.sp.block_scale` operation is an asynchronous tensor core
+ instruction that performs matrix multiplication, accumulation with block
+ scaling, and sparse `A` matrix in a single fused operation. It targets
+ 5th-generation tensor cores, providing developers with fine-grained control
+ over execution and scheduling.
+
+ ```
+ D = (A * scale_a) * (B * scale_b)     // if `enableInputD` is false
+ D = (A * scale_a) * (B * scale_b) + D // otherwise
+ ```
+
+ where:
+ - A is an M x (K / 2) matrix in tensor memory or described using shared memory descriptor
+ - B is a K x N matrix described using shared memory descriptor
+ - D is an M x N accumulator matrix in tensor memory
+ - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively
+
+ Other attributes and operands are similar to that of tcgen05.mma.block_scale Op
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp)
+ }];
+
+ let arguments = (ins
+ Tcgen05MMABlockScaleKindAttr:$kind,
+ CTAGroupKindAttr:$ctaGroup,
+ DefaultValuedAttr:$blockScale,
+ DefaultValuedAttr:$collectorOp,
+ LLVM_PointerTensor:$matrixD,
+ AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
+ I64:$matrixB,
+ I32:$idesc,
+ I1:$enableInputD,
+ LLVM_PointerTensor:$sparseMetadata,
+ LLVM_PointerTensor:$scaleA,
+ LLVM_PointerTensor:$scaleB
+ );
+
+ let assemblyFormat = [{
+ $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata `,` $scaleA `,` $scaleB
+ attr-dict `:` `(` type(operands) `)`
+ }];
+
+ let hasVerifier = true;
+
+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase &builder);
+ }];
+
+ let llvmBuilder = [{
+ auto [ID, args] = NVVM::Tcgen05MMASparseBlockScaleOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ createIntrinsicCall(builder, ID, args);
+ }];
+}
+
+def Tcgen05MMACollectorBBuffer0 : I32EnumAttrCase<"B0", 0, "b0">;
+def Tcgen05MMACollectorBBuffer1 : I32EnumAttrCase<"B1", 1, "b1">;
+def Tcgen05MMACollectorBBuffer2 : I32EnumAttrCase<"B2", 2, "b2">;
+def Tcgen05MMACollectorBBuffer3 : I32EnumAttrCase<"B3", 3, "b3">;
+
+def Tcgen05MMACollectorBBuffer : I32EnumAttr<
+ "Tcgen05MMACollectorBBuffer",
+ "tcgen05 MMA Collector Buffer B Attribute",
+ [Tcgen05MMACollectorBBuffer0, Tcgen05MMACollectorBBuffer1, Tcgen05MMACollectorBBuffer2,
+ Tcgen05MMACollectorBBuffer3]> {
+ let cppNamespace = "::mlir::NVVM";
+ let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMACollectorBBufferAttr : EnumAttr {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_Tcgen05MMAWsOp : NVVM_Op<"tcgen05.mma.ws",
+ [NVVMRequiresSMa<[100, 110]>]> {
+ let summary = "Performs weight stationary convolution MMA operation on 5th-gen tensor cores";
+
+ let description = [{
+ The `tcgen05.mma.ws` operation is an asynchronous tensor core instruction
+ that performs weight stationary convolution matrix multiplication, accumulation
+ in a single fused operation. It targets 5th-generation tensor cores, providing
+ developers with fine-grained control over execution and scheduling.
+
+ ```
+ D = A * B     // if `enableInputD` is false
+ D = A * B + D // otherwise
+ ```
+
+ where:
+ - A is an `M x K` matrix in tensor memory or described using shared memory descriptor
+ - B is a `K x N` matrix described using shared memory descriptor
+ - D is an `M x N` accumulator matrix in tensor memory
+
+ The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op
+
+ - idesc is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
+
+ Optional Operands:
+ - zeroColMask is a 64-bit value representing the [Zero-column mask descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-zero-column-mask-descriptor)
+
+ Required Attributes:
+ - `kind` is a Tcgen05MMAKind attribute
+
+ Default Valued Attributes:
+ - collectorBBuffer specifies collector buffer for matrix B: b0 (default), b1, b2, b3
+
+ - collectorOp is a Tcgen05MMACollectorOp attribute with matrix B as the collector buffer
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-ws)
+ }];
+
+ let arguments = (ins
+ Tcgen05MMAKindAttr:$kind,
+ DefaultValuedAttr:$collectorBBuffer,
+ DefaultValuedAttr:$collectorOp,
+ LLVM_PointerTensor:$matrixD,
+ AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
+ I64:$matrixB,
+ I32:$idesc,
+ I1:$enableInputD,
+ Optional:$zeroColMask
+ );
+
+ let assemblyFormat = [{
+ $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD (`,` $zeroColMask^)?
+ attr-dict `:` `(` type(operands) `)`
+ }];
+
+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase &builder);
+ }];
+
+ let llvmBuilder = [{
+ auto [ID, args] =
+ NVVM::Tcgen05MMAWsOp::getIntrinsicIDAndArgs(*op, moduleTranslation, builder);
+ createIntrinsicCall(builder, ID, args);
+ }];
+}
+
+def NVVM_Tcgen05MMAWsSparseOp : NVVM_Op<"tcgen05.mma.ws.sp",
+ [NVVMRequiresSMa<[100, 110]>]> {
+ let summary = "Performs weight stationary convolution MMA with sparse A matrix on 5th-gen tensor cores";
+
+ let description = [{
+ The `tcgen05.mma.ws.sp` operation is an asynchronous tensor core instruction
+ that performs weight stationary convolution matrix multiplication, accumulation
+ with sparse `A` matrix in a single fused operation. It targets 5th-generation
+ tensor cores, providing developers with fine-grained control over execution
+ and scheduling.
+ + ``` + D = A * B` // if `enableInputD` is false + D = A * B + D` // otherwise + ``` + + where: + - A is an M x (K / 2) matrix in memory or descriptor format + - B is a K x N matrix + - D is an M x N accumulator matrix + - sparseMetadata located in tensor memory specifies the mapping of the `K / 2` + non-zero elements to the K elements before performing the MMA operation + + Other attributes and operands are similar to that of tcgen05.mma.ws Op + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-ws-sp) + }]; + + let arguments = (ins + Tcgen05MMAKindAttr:$kind, + DefaultValuedAttr:$collectorBBuffer, + DefaultValuedAttr:$collectorOp, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + LLVM_PointerTensor:$sparseMetadata, + Optional:$zeroColMask + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata (`,` $zeroColMask^)? attr-dict `:` `(` type(operands) `)` + }]; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = NVVM::Tcgen05MMAWsSparseOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + //===----------------------------------------------------------------------===// // NVVM target attribute. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index d64c4d64cad84..5848489274c13 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -419,7 +419,10 @@ struct LowerGpuOpsToNVVMOpsPass final if (this->hasRedux) populateGpuSubgroupReduceOpLoweringPattern(converter, llvmPatterns); configureGpuToNVVMConversionLegality(target); - if (failed(applyPartialConversion(m, target, std::move(llvmPatterns)))) + ConversionConfig config; + config.allowPatternRollback = allowPatternRollback; + if (failed( + applyPartialConversion(m, target, std::move(llvmPatterns), config))) signalPassFailure(); } }; diff --git a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp index 37cfc9f2c23e6..03842cc9bd3a0 100644 --- a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp +++ b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp @@ -36,6 +36,7 @@ namespace { struct SCFToControlFlowPass : public impl::SCFToControlFlowPassBase { + using Base::Base; void runOnOperation() override; }; @@ -736,7 +737,9 @@ void SCFToControlFlowPass::runOnOperation() { target.addIllegalOp(); target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); - if (failed( - applyPartialConversion(getOperation(), target, std::move(patterns)))) + ConversionConfig config; + config.allowPatternRollback = allowPatternRollback; + if (failed(applyPartialConversion(getOperation(), target, std::move(patterns), + config))) signalPassFailure(); } diff --git a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp index 50fca564b5b64..02b61bd989368 100644 --- a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp +++ b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp @@ 
-1520,20 +1520,12 @@ class TanPattern : public SPIRVToLLVMConversion { if (!dstType) return rewriter.notifyMatchFailure(tanOp, "type conversion failed"); - Location loc = tanOp.getLoc(); - Value sin = LLVM::SinOp::create(rewriter, loc, dstType, tanOp.getOperand()); - Value cos = LLVM::CosOp::create(rewriter, loc, dstType, tanOp.getOperand()); - rewriter.replaceOpWithNewOp(tanOp, dstType, sin, cos); + rewriter.replaceOpWithNewOp(tanOp, dstType, + adaptor.getOperands()); return success(); } }; -/// Convert `spirv.Tanh` to -/// -/// exp(2x) - 1 -/// ----------- -/// exp(2x) + 1 -/// class TanhPattern : public SPIRVToLLVMConversion { public: using SPIRVToLLVMConversion::SPIRVToLLVMConversion; @@ -1546,18 +1538,8 @@ class TanhPattern : public SPIRVToLLVMConversion { if (!dstType) return rewriter.notifyMatchFailure(tanhOp, "type conversion failed"); - Location loc = tanhOp.getLoc(); - Value two = createFPConstant(loc, srcType, dstType, rewriter, 2.0); - Value multiplied = - LLVM::FMulOp::create(rewriter, loc, dstType, two, tanhOp.getOperand()); - Value exponential = LLVM::ExpOp::create(rewriter, loc, dstType, multiplied); - Value one = createFPConstant(loc, srcType, dstType, rewriter, 1.0); - Value numerator = - LLVM::FSubOp::create(rewriter, loc, dstType, exponential, one); - Value denominator = - LLVM::FAddOp::create(rewriter, loc, dstType, exponential, one); - rewriter.replaceOpWithNewOp(tanhOp, dstType, numerator, - denominator); + rewriter.replaceOpWithNewOp(tanhOp, dstType, + adaptor.getOperands()); return success(); } }; diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp index e0cf353da207f..9b11270e7bbe2 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp @@ -680,16 +680,6 @@ bool AnalysisState::hasUndefinedContents(OpOperand *opOperand) const { return false; } -// bufferization.to_buffer is not allowed to change the rank. 
-static void ensureToBufferOpIsValid(Value tensor, Type memrefType) { -#ifndef NDEBUG - auto rankedTensorType = llvm::dyn_cast(tensor.getType()); - assert((!rankedTensorType || llvm::cast(memrefType).getRank() == - rankedTensorType.getRank()) && - "to_buffer would be invalid: mismatching ranks"); -#endif -} - FailureOr bufferization::getBuffer(RewriterBase &rewriter, Value value, const BufferizationOptions &options, const BufferizationState &state) { @@ -708,7 +698,7 @@ FailureOr bufferization::getBuffer(RewriterBase &rewriter, Value value, FailureOr bufferType = getBufferType(value, options, state); if (failed(bufferType)) return failure(); - ensureToBufferOpIsValid(value, *bufferType); + return bufferization::ToBufferOp::create(rewriter, value.getLoc(), *bufferType, value) .getResult(); diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp index d6c3cd62ee742..bd177ba1afccd 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp @@ -54,9 +54,6 @@ struct BuiltinTensorExternalModel mlir::LogicalResult verifyCompatibleBufferType( mlir::Type tensor, BufferLikeType bufferType, llvm::function_ref emitError) const { - assert(isa(tensor) && "expected tensor type"); - assert(isa(bufferType) && "expected memref type"); - auto tensorType = cast(tensor); auto memrefType = cast(bufferType); diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp index 2c3e4661d266a..5462cddd44718 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp @@ -72,6 +72,7 @@ void buildGpuPassPipeline(OpPassManager &pm, ConvertGpuOpsToNVVMOpsOptions opt; opt.useBarePtrCallConv = options.kernelUseBarePtrCallConv; opt.indexBitwidth = options.indexBitWidth; + opt.allowPatternRollback = options.allowPatternRollback; pm.addNestedPass(createConvertGpuOpsToNVVMOps(opt)); pm.addNestedPass(createCanonicalizerPass()); pm.addNestedPass(createCSEPass()); diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 7ac427dbe3941..369305b40c689 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/NVPTXAddrSpace.h" @@ -60,6 +61,18 @@ static bool isPtrInSharedCTASpace(mlir::Value ptr) { return isPtrInAddrSpace(ptr, NVVMMemorySpace::Shared); } +// Helper method to convert CtaGroupKind in NVVM Dialect to CtaGroupKind in LLVM +static constexpr llvm::nvvm::CTAGroupKind +getNVVMCtaGroupKind(NVVM::CTAGroupKind ctaGroup) { + switch (ctaGroup) { + case NVVM::CTAGroupKind::CTA_1: + return llvm::nvvm::CTAGroupKind::CG_1; + case NVVM::CTAGroupKind::CTA_2: + return llvm::nvvm::CTAGroupKind::CG_2; + } + llvm_unreachable("unsupported cta_group value"); +} + //===----------------------------------------------------------------------===// // Verifier methods //===----------------------------------------------------------------------===// @@ -3091,6 +3104,605 @@ NVVM::IDArgPair ClusterLaunchControlQueryCancelOp::getIntrinsicIDAndArgs( return {intrinsicID, args}; } 
+//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair +Tcgen05MMAOp::getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + const bool isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + args.push_back(mt.lookupValue(thisOp.getIdesc())); + args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + + using EnableAShiftArray = std::array; + using CtaGroupArray = std::array; + using IsATensorArray = std::array; + using HasScaleInputDArray = std::array; + using HasDisableOutputLaneArray = std::array; + + // [hasDisableOutputLane][hasScaleInputD][isATensor][CtaGroup][EnableAShift] + static constexpr HasDisableOutputLaneArray tcgen05MMAIDs = { + { // without diable output lane + {{// without scale input D + {{ + // shared + {{// cg1 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared, notIntrinsic}, + // cg2 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared, notIntrinsic}}}, + {{// tensor + { + // cg1 + llvm::Intrinsic::nvvm_tcgen05_mma_tensor, + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_ashift, + }, + { + // cg2 + llvm::Intrinsic::nvvm_tcgen05_mma_tensor, + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_ashift, + }}}, + }}, + // with scale input D + {{ // shared + {{// cg1 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared_scale_d, notIntrinsic}, + // cg2 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared_scale_d, notIntrinsic}}}, + {{// tensor + { + // cg1 + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d, + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_ashift, + }, + { + // cg2 + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d, + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_ashift, + }}}}}}}, + // with disable output lane + {{ // without scale input D + {{ // shared + {{// cg1 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1, + notIntrinsic}, + // cg2 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2, + notIntrinsic}}}, + {{// cg1 + { + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_disable_output_lane_cg1, + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift, + }, + // cg2 + { + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_disable_output_lane_cg2, + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift, + }}}}}, + // with scale input D + {{ // shared + {{// cg1 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1, + notIntrinsic}, + // cg2 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2, + notIntrinsic}}}, + // tensor + {{// cg1 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1, + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift}, + // cg2 + { + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2, + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift, + }}}}}}}}}; + + llvm::Value *ScaleInputD = mt.lookupValue(thisOp.getScaleInputD()); + bool hasScaleInputD = ScaleInputD != nullptr; + + llvm::Value *DisableOutputLane = + mt.lookupValue(thisOp.getDisableOutputLane()); + bool hasDisableOutputLane = DisableOutputLane != nullptr; + + const 
unsigned ctaGroup = + static_cast(getNVVMCtaGroupKind(thisOp.getCtaGroup())); + + llvm::Intrinsic::ID ID = + tcgen05MMAIDs[hasDisableOutputLane][hasScaleInputD][isATensor] + [ctaGroup - 1][thisOp.getAShift()]; + + assert(ID != notIntrinsic && "Invalid intrinsic for Tcgen05MMAOp."); + + if (hasScaleInputD) + args.push_back(ScaleInputD); + + if (hasDisableOutputLane) + args.push_back(DisableOutputLane); + + args.push_back(builder.getInt32(static_cast(thisOp.getKind()))); + + if (!hasDisableOutputLane) + args.push_back(builder.getInt32(ctaGroup)); + + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + return {ID, args}; +} + +static LogicalResult +verifyTcgen05MMAOp(bool isATensor, mlir::Value disableOutputLane, + NVVM::CTAGroupKind ctaGroup, bool hasAShift, + NVVM::Tcgen05MMACollectorOp collectorOp, Location loc) { + + if (disableOutputLane) { + mlir::VectorType disableOutputLaneType = + cast(disableOutputLane.getType()); + if ((ctaGroup == NVVM::CTAGroupKind::CTA_1 && + disableOutputLaneType.getNumElements() != 4) || + (ctaGroup == NVVM::CTAGroupKind::CTA_2 && + disableOutputLaneType.getNumElements() != 8)) + return emitError(loc) << "Disable Output Lane of length " + << disableOutputLaneType.getNumElements() + << " is incompatible with CtaGroupAttr"; + } + + if (hasAShift && !isATensor) + return emitError( + loc, "A-shift can be applied only when matrix A is in tensor memory"); + + if (hasAShift == true && (collectorOp == Tcgen05MMACollectorOp::FILL || + collectorOp == Tcgen05MMACollectorOp::USE)) + return emitError( + loc, "Cannot use collector buffer operation fill or use with ashift"); + + return success(); +} + +LogicalResult Tcgen05MMAOp::verify() { + return verifyTcgen05MMAOp(isa(getMatrixA().getType()), + getDisableOutputLane(), getCtaGroup(), getAShift(), + getCollectorOp(), getLoc()); +} + +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma.sp functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair Tcgen05MMASparseOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + bool isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + args.push_back(mt.lookupValue(thisOp.getIdesc())); + args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + args.push_back(mt.lookupValue(thisOp.getSparseMetadata())); + + using EnableAShiftArray = std::array; + using CtaGroupArray = std::array; + using IsATensorArray = std::array; + using HasScaleInputDArray = std::array; + using HasDisableOutputLaneArray = std::array; + + // [hasDisableOutputLane][hasScaleInputD][isATensor][CtaGroup][EnableAShift] + static constexpr HasDisableOutputLaneArray tcgen05MMASparseIDs = { + { // without diable output lane + {{// without scale input D + {{ + // shared + {{// cg1 + {llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared, notIntrinsic}, + // cg2 + {llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared, notIntrinsic}}}, + {{// tensor + { + // cg1 + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor, + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_ashift, + }, + { + // cg2 + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor, + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_ashift, + }}}, + }}, + // with scale input D + {{ // shared + {{// cg1 
+ {llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d, + notIntrinsic}, + // cg2 + {llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d, + notIntrinsic}}}, + {{// tensor + { + // cg1 + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d, + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_ashift, + }, + { + // cg2 + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d, + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_ashift, + }}}}}}}, + // with disable output lane + {{ // without scale input D + {{ // shared + {{// cg1 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1, + notIntrinsic}, + // cg2 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2, + notIntrinsic}}}, + {{// cg1 + { + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1, + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift, + }, + // cg2 + { + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2, + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift, + }}}}}, + // with scale input D + {{ // shared + {{// cg1 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1, + notIntrinsic}, + // cg2 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2, + notIntrinsic}}}, + // tensor + {{// cg1 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1, + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift}, + // cg2 + { + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2, + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift, + }}}}}}}}}; + + llvm::Value *ScaleInputD = mt.lookupValue(thisOp.getScaleInputD()); + bool hasScaleInputD = ScaleInputD != nullptr; + + llvm::Value *DisableOutputLane = + mt.lookupValue(thisOp.getDisableOutputLane()); + bool hasDisableOutputLane = DisableOutputLane != nullptr; + + unsigned ctaGroup = + static_cast(getNVVMCtaGroupKind(thisOp.getCtaGroup())); + + llvm::Intrinsic::ID ID = + tcgen05MMASparseIDs[hasDisableOutputLane][hasScaleInputD][isATensor] + [ctaGroup - 1][thisOp.getAShift()]; + + assert(ID != notIntrinsic && "Invalid intrinsic for Tcgen05MMASparseOp."); + + if (hasScaleInputD) + args.push_back(ScaleInputD); + + if (hasDisableOutputLane) + args.push_back(DisableOutputLane); + + args.push_back(builder.getInt32(static_cast(thisOp.getKind()))); + + if (!hasDisableOutputLane) + args.push_back(builder.getInt32(ctaGroup)); + + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + return {ID, args}; +} + +LogicalResult Tcgen05MMASparseOp::verify() { + return verifyTcgen05MMAOp(isa(getMatrixA().getType()), + getDisableOutputLane(), getCtaGroup(), getAShift(), + getCollectorOp(), getLoc()); +} + +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma.block_scale functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair Tcgen05MMABlockScaleOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + bool isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + 
args.push_back(mt.lookupValue(thisOp.getIdesc())); + args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + args.push_back(mt.lookupValue(thisOp.getScaleA())); + args.push_back(mt.lookupValue(thisOp.getScaleB())); + args.push_back(builder.getInt32( + static_cast(getNVVMCtaGroupKind(thisOp.getCtaGroup())))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + auto kind = thisOp.getKind(); + auto blockScale = thisOp.getBlockScale(); + llvm::Intrinsic::ID ID = [&]() { + if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF8F6F4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) { + return isATensor ? llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_mxf8f6f4_block_scale + : llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_mxf8f6f4_block_scale; + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor + ? llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_mxf8f6f4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_mxf8f6f4_block_scale_block32; + } + } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) { + return isATensor + ? llvm::Intrinsic::nvvm_tcgen05_mma_tensor_mxf4_block_scale + : llvm::Intrinsic::nvvm_tcgen05_mma_shared_mxf4_block_scale; + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor ? llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_mxf4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_mxf4_block_scale_block32; + } + } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4NVF4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor + ? llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_mxf4nvf4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_mxf4nvf4_block_scale_block32; + + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK16) { + return isATensor + ? 
llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_mxf4nvf4_block_scale_block16 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_mxf4nvf4_block_scale_block16; + } + } + llvm_unreachable("Invalid tcgen05.mma.block_scale attributes"); + }(); + + return {ID, args}; +} + +static LogicalResult +verifyTcgen05MMABlockScaleOp(NVVM::Tcgen05MMACollectorOp collectorOp, + NVVM::Tcgen05MMABlockScaleKind kind, + NVVM::Tcgen05MMABlockScale blockScale, + Location loc) { + + if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT && + kind == Tcgen05MMABlockScaleKind::MXF4NVF4) + return emitError(loc, "mxf4nvf4 requires block scale attribute"); + + if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK16 && + kind != Tcgen05MMABlockScaleKind::MXF4NVF4) + return emitError(loc, + llvm::formatv("{} kind does not support block16 attribute", + stringifyEnum(kind))); + + return success(); +} + +LogicalResult Tcgen05MMABlockScaleOp::verify() { + return verifyTcgen05MMABlockScaleOp(getCollectorOp(), getKind(), + getBlockScale(), getLoc()); +} + +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma.sp.block_scale functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair Tcgen05MMASparseBlockScaleOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + bool isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + args.push_back(mt.lookupValue(thisOp.getIdesc())); + args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + args.push_back(mt.lookupValue(thisOp.getSparseMetadata())); + args.push_back(mt.lookupValue(thisOp.getScaleA())); + args.push_back(mt.lookupValue(thisOp.getScaleB())); + args.push_back(builder.getInt32( + static_cast(getNVVMCtaGroupKind(thisOp.getCtaGroup())))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + auto kind = thisOp.getKind(); + auto blockScale = thisOp.getBlockScale(); + llvm::Intrinsic::ID ID = [&]() { + if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF8F6F4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) { + return isATensor ? llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf8f6f4_block_scale + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf8f6f4_block_scale; + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor + ? llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf8f6f4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf8f6f4_block_scale_block32; + } + } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) { + return isATensor ? llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf4_block_scale + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf4_block_scale; + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor + ? llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf4_block_scale_block32; + } + } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4NVF4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor + ? 
llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf4nvf4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf4nvf4_block_scale_block32; + + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK16) { + return isATensor + ? llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf4nvf4_block_scale_block16 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf4nvf4_block_scale_block16; + } + } + llvm_unreachable("Invalid tcgen05.mma.sp.block_scale attributes"); + }(); + + return {ID, args}; +} + +LogicalResult Tcgen05MMASparseBlockScaleOp::verify() { + return verifyTcgen05MMABlockScaleOp(getCollectorOp(), getKind(), + getBlockScale(), getLoc()); +} + +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma.ws functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair Tcgen05MMAWsOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + bool isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + args.push_back(mt.lookupValue(thisOp.getIdesc())); + args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + + mlir::Value ZeroColMask = thisOp.getZeroColMask(); + llvm::Intrinsic::ID ID = notIntrinsic; + if (ZeroColMask) { + args.push_back(mt.lookupValue(ZeroColMask)); + ID = isATensor ? llvm::Intrinsic::nvvm_tcgen05_mma_ws_tensor_zero_col_mask + : llvm::Intrinsic::nvvm_tcgen05_mma_ws_shared_zero_col_mask; + } else + ID = isATensor ? llvm::Intrinsic::nvvm_tcgen05_mma_ws_tensor + : llvm::Intrinsic::nvvm_tcgen05_mma_ws_shared; + + args.push_back(builder.getInt32(static_cast(thisOp.getKind()))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorBBuffer()))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + return {ID, args}; +} + +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma.ws.sp functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair Tcgen05MMAWsSparseOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + bool isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + args.push_back(mt.lookupValue(thisOp.getIdesc())); + args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + args.push_back(mt.lookupValue(thisOp.getSparseMetadata())); + + mlir::Value ZeroColMask = thisOp.getZeroColMask(); + llvm::Intrinsic::ID ID = notIntrinsic; + if (ZeroColMask) { + args.push_back(mt.lookupValue(ZeroColMask)); + ID = isATensor + ? llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_tensor_zero_col_mask + : llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_shared_zero_col_mask; + } else + ID = isATensor ? 
llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_tensor + : llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_shared; + + args.push_back(builder.getInt32(static_cast(thisOp.getKind()))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorBBuffer()))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + return {ID, args}; +} + //===----------------------------------------------------------------------===// // NVVMDialect initialization, type parsing, and registration. //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir index a4b5dde8a2187..f1cc1eb983267 100644 --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1' -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 allow-pattern-rollback=0' -split-input-file | FileCheck %s // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 allowed-dialects=func,arith,cf' -split-input-file | FileCheck %s // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 use-bare-ptr-memref-call-conv=1' -split-input-file | FileCheck %s --check-prefix=CHECK-BARE // RUN: mlir-opt %s -transform-interpreter | FileCheck %s diff --git a/mlir/test/Conversion/GPUToNVVM/memref.mlir b/mlir/test/Conversion/GPUToNVVM/memref.mlir index e164ca9103dee..a4e8ead344114 100644 --- a/mlir/test/Conversion/GPUToNVVM/memref.mlir +++ b/mlir/test/Conversion/GPUToNVVM/memref.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -convert-gpu-to-nvvm | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-nvvm="allow-pattern-rollback=0" | FileCheck %s // RUN: mlir-opt %s -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=1' \ // RUN: | FileCheck %s --check-prefix=BARE diff --git a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir index b479467efc208..82c02c1d6ee63 100644 --- a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s +// RUN: mlir-opt --convert-gpu-to-nvvm="allow-pattern-rollback=0" --split-input-file %s | FileCheck %s // RUN: mlir-opt --convert-gpu-to-nvvm="index-bitwidth=32" --split-input-file %s | FileCheck --check-prefix=CHECK32 %s gpu.module @test_module { diff --git a/mlir/test/Conversion/SCFToControlFlow/convert-to-cfg.mlir b/mlir/test/Conversion/SCFToControlFlow/convert-to-cfg.mlir index 483c7b35c6ec8..0c4f20e8d1a04 100644 --- a/mlir/test/Conversion/SCFToControlFlow/convert-to-cfg.mlir +++ b/mlir/test/Conversion/SCFToControlFlow/convert-to-cfg.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-cf -split-input-file %s | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-cf="allow-pattern-rollback=0" -split-input-file %s | FileCheck %s // CHECK-LABEL: func @simple_std_for_loop(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index) { // CHECK-NEXT: cf.br ^bb1(%{{.*}} : index) diff --git a/mlir/test/Conversion/SPIRVToLLVM/gl-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/gl-ops-to-llvm.mlir index e1936e2fd8abe..b17e1c40cb9a7 100644 --- a/mlir/test/Conversion/SPIRVToLLVM/gl-ops-to-llvm.mlir +++ b/mlir/test/Conversion/SPIRVToLLVM/gl-ops-to-llvm.mlir @@ -162,9 +162,7 @@ spirv.func @sqrt(%arg0: f32, %arg1: vector<3xf16>) 
"None" { // CHECK-LABEL: @tan spirv.func @tan(%arg0: f32) "None" { - // CHECK: %[[SIN:.*]] = llvm.intr.sin(%{{.*}}) : (f32) -> f32 - // CHECK: %[[COS:.*]] = llvm.intr.cos(%{{.*}}) : (f32) -> f32 - // CHECK: llvm.fdiv %[[SIN]], %[[COS]] : f32 + // CHECK: llvm.intr.tan(%{{.*}}) : (f32) -> f32 %0 = spirv.GL.Tan %arg0 : f32 spirv.Return } @@ -175,13 +173,7 @@ spirv.func @tan(%arg0: f32) "None" { // CHECK-LABEL: @tanh spirv.func @tanh(%arg0: f32) "None" { - // CHECK: %[[TWO:.*]] = llvm.mlir.constant(2.000000e+00 : f32) : f32 - // CHECK: %[[X2:.*]] = llvm.fmul %[[TWO]], %{{.*}} : f32 - // CHECK: %[[EXP:.*]] = llvm.intr.exp(%[[X2]]) : (f32) -> f32 - // CHECK: %[[ONE:.*]] = llvm.mlir.constant(1.000000e+00 : f32) : f32 - // CHECK: %[[T0:.*]] = llvm.fsub %[[EXP]], %[[ONE]] : f32 - // CHECK: %[[T1:.*]] = llvm.fadd %[[EXP]], %[[ONE]] : f32 - // CHECK: llvm.fdiv %[[T0]], %[[T1]] : f32 + // CHECK: llvm.intr.tanh(%{{.*}}) : (f32) -> f32 %0 = spirv.GL.Tanh %arg0 : f32 spirv.Return } diff --git a/mlir/test/Dialect/Bufferization/invalid.mlir b/mlir/test/Dialect/Bufferization/invalid.mlir index 2c8807b66de74..9884b040119d0 100644 --- a/mlir/test/Dialect/Bufferization/invalid.mlir +++ b/mlir/test/Dialect/Bufferization/invalid.mlir @@ -127,3 +127,63 @@ func.func @invalid_manual_deallocation() { // expected-error @below{{op attribute 'bufferization.manual_deallocation' can be used only on ops that have an allocation and/or free side effect}} arith.constant {bufferization.manual_deallocation} 0 : index } + +// ----- + +func.func @invalid_rank_to_buffer(%t: tensor<1x2x3x4xf32>) { + // expected-error @below{{'bufferization.to_buffer' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{shapes do not match}} + %b = bufferization.to_buffer %t + : tensor<1x2x3x4xf32> to memref<1x2x3xf32> + return +} + +// ----- + +func.func @invalid_rank_to_tensor(%b: memref<1x2x3xf32>) { + // expected-error @below{{'bufferization.to_tensor' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{shapes do not match}} + %t = bufferization.to_tensor %b + : memref<1x2x3xf32> to tensor<1x2x3x4xf32> + return +} + +// ----- + +func.func @invalid_shape_to_buffer(%t: tensor<1x2x3x4xf32>) { + // expected-error @below{{'bufferization.to_buffer' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{shapes do not match}} + %b = bufferization.to_buffer %t + : tensor<1x2x3x4xf32> to memref<1x2x4x3xf32> + return +} + +// ----- + +func.func @invalid_shape_to_tensor(%b: memref<1x2x4x3xf32>) { + // expected-error @below{{'bufferization.to_tensor' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{shapes do not match}} + %t = bufferization.to_tensor %b + : memref<1x2x4x3xf32> to tensor<1x2x3x4xf32> + return +} + +// ----- + +func.func @invalid_type_to_buffer(%t: tensor<1x2x3x4xf32>) { + // expected-error @below{{'bufferization.to_buffer' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{element types do not match}} + %b = bufferization.to_buffer %t + : tensor<1x2x3x4xf32> to memref<1x2x3x4xf16> + return +} + +// ----- + +func.func @invalid_type_to_tensor(%b: memref<1x2x3x4xf16>) { + // expected-error @below{{'bufferization.to_tensor' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{element types do not match}} + %t2 = bufferization.to_tensor %b + : memref<1x2x3x4xf16> to tensor<1x2x3x4xf32> + return 
+} diff --git a/mlir/test/Dialect/Bufferization/ops.mlir b/mlir/test/Dialect/Bufferization/ops.mlir index fc6df4a09f706..b0db1bb2d0389 100644 --- a/mlir/test/Dialect/Bufferization/ops.mlir +++ b/mlir/test/Dialect/Bufferization/ops.mlir @@ -83,3 +83,40 @@ func.func @test_dealloc_op(%arg0: memref<2xf32>, %arg1: memref<4xi32>, bufferization.dealloc return %0#0, %0#1 : i1, i1 } + +// CHECK: func.func @test_builtin_custom_builtin_type_conversion +// CHECK-SAME: (%[[t:.*]]: tensor<42xf32>) -> tensor<42xf32> +func.func @test_builtin_custom_builtin_type_conversion(%t: tensor<42xf32>) + -> tensor<42xf32> { + // CHECK: %[[buffer:.*]] = bufferization.to_buffer %[[t]] + // CHECK-SAME: to !test.test_memref<[42], f32> + %buffer = bufferization.to_buffer %t + : tensor<42xf32> to !test.test_memref<[42], f32> + + // CHECK: %[[tensor:.*]] = bufferization.to_tensor %[[buffer]] + // CHECK-SAME: to tensor<42xf32> + %tensor = bufferization.to_tensor %buffer + : !test.test_memref<[42], f32> to tensor<42xf32> + + // CHECK: return %[[tensor]] + return %tensor : tensor<42xf32> +} + +// CHECK: func.func @test_custom_builtin_custom_type_conversion +// CHECK-SAME: (%[[t:.*]]: !test.test_tensor<[42], f32>) +// CHECK-SAME: -> !test.test_tensor<[42], f32> +func.func @test_custom_builtin_custom_type_conversion(%t: !test.test_tensor<[42], f32>) + -> !test.test_tensor<[42], f32> { + // CHECK: %[[buffer:.*]] = bufferization.to_buffer %[[t]] + // CHECK-SAME: to memref<42xf32> + %buffer = bufferization.to_buffer %t + : !test.test_tensor<[42], f32> to memref<42xf32> + + // CHECK: %[[tensor:.*]] = bufferization.to_tensor %[[buffer]] + // CHECK-SAME: to !test.test_tensor<[42], f32> + %tensor = bufferization.to_tensor %buffer + : memref<42xf32> to !test.test_tensor<[42], f32> + + // CHECK: return %[[tensor]] + return %tensor : !test.test_tensor<[42], f32> +} diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir index 5585d98c25b82..d0001f6ffc376 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir index cd90ce3ba2f1a..fcff5f40a6cc7 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir index fec2567f47f15..4718ac94fa0f2 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // 
RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir index d5633b00313b3..5e3a7e7e7d729 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir index db297b0fc27b7..f1a48ae0c19c5 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir index 65cbc79752177..f0a46cea7ceb9 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir index a0c955e4b570c..ddbabd4ddf960 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir index f041df82b4325..5c56e2ddfbd51 100644 --- a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir +++ b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/assert.mlir b/mlir/test/Integration/GPU/CUDA/assert.mlir index 71a21cf4bd620..83cf70cd17078 100644 --- a/mlir/test/Integration/GPU/CUDA/assert.mlir +++ b/mlir/test/Integration/GPU/CUDA/assert.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s 
-gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: mlir-opt %s -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir index 34dde6e03c80e..77a4fa089b62d 100644 --- a/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir +++ b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 ptxas-cmd-options='-v --register-usage-level=8'" -debug-only=serialize-to-binary \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 ptxas-cmd-options='-v --register-usage-level=8' allow-pattern-rollback=0" -debug-only=serialize-to-binary \ // RUN: 2>&1 | FileCheck %s func.func @host_function(%arg0 : f32, %arg1 : memref) { diff --git a/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir b/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir index ed01416d9523a..51f6e36aaa977 100644 --- a/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir +++ b/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir @@ -2,7 +2,7 @@ // increment a global atomic counter and wait for the counter to reach 2. // // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | env CUDA_MODULE_LOADING=EAGER mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir index 27ec1ec435fef..efffcaaf23b2e 100644 --- a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir +++ b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline -debug-only=serialize-to-isa \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" -debug-only=serialize-to-isa \ // RUN: 2>&1 | FileCheck %s // CHECK-LABEL: Generated by LLVM NVPTX Back-End diff --git a/mlir/test/Integration/GPU/CUDA/dump-sass.mlir b/mlir/test/Integration/GPU/CUDA/dump-sass.mlir index d32f5efc29d58..f810678569615 100644 --- a/mlir/test/Integration/GPU/CUDA/dump-sass.mlir +++ b/mlir/test/Integration/GPU/CUDA/dump-sass.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline -debug-only=dump-sass \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" -debug-only=dump-sass \ // RUN: 2>&1 | FileCheck %s // CHECK: MOV diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir index 07f3218ae89b2..fe3c2b1d93a1b 100644 --- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir +++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir 
b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir index b2ac90acde94f..f8f1aa8aaa42e 100644 --- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir +++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir index fd664f2331488..ef116760b69e5 100644 --- a/mlir/test/Integration/GPU/CUDA/printf.mlir +++ b/mlir/test/Integration/GPU/CUDA/printf.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir index a6207d64c038b..a4be5223cd792 100644 --- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir +++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir index c3cee2fda46f3..3490003d6ba19 100644 --- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir +++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index cc243c86ca902..0e087200b1116 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -2373,7 +2373,7 @@ llvm.func @readonly_function(%arg0: !llvm.ptr {llvm.readonly}) llvm.func @arg_mem_none_func() attributes { memory_effects = #llvm.memory_effects} -// CHECK: attributes #[[ATTR]] = { memory(readwrite, argmem: none, errnomem: none) } +// CHECK: attributes #[[ATTR]] = { memory(readwrite, argmem: none, errnomem: none, target_mem0: none, target_mem1: none) } // ----- @@ -2381,7 +2381,7 @@ llvm.func @arg_mem_none_func() attributes { llvm.func @readwrite_func() attributes { memory_effects = #llvm.memory_effects} -// CHECK: attributes #[[ATTR]] = { memory(readwrite, errnomem: none) } +// CHECK: attributes #[[ATTR]] = { memory(readwrite, errnomem: none, target_mem0: none, target_mem1: none) } // ----- @@ -2734,11 +2734,11 @@ llvm.func @mem_effects_call() { // CHECK: #[[ATTRS_0]] // CHECK-SAME: memory(none) // CHECK: #[[ATTRS_1]] -// CHECK-SAME: memory(read, argmem: none, inaccessiblemem: write, errnomem: none) +// CHECK-SAME: memory(read, 
argmem: none, inaccessiblemem: write, errnomem: none, target_mem0: none, target_mem1: none) // CHECK: #[[ATTRS_2]] -// CHECK-SAME: memory(read, inaccessiblemem: write, errnomem: none) +// CHECK-SAME: memory(read, inaccessiblemem: write, errnomem: none, target_mem0: none, target_mem1: none) // CHECK: #[[ATTRS_3]] -// CHECK-SAME: memory(readwrite, argmem: read, errnomem: none) +// CHECK-SAME: memory(readwrite, argmem: read, errnomem: none, target_mem0: none, target_mem1: none) // ----- diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir new file mode 100644 index 0000000000000..db4574bfaf78f --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, 
%b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, 
%idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, 
i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : 
(!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale 
%d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, 
collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir new file mode 100644 index 0000000000000..a15c3fb73de9c --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, 
%scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = 
#nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, 
%scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 
1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, 
!llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, 
%a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir new file mode 100644 index 0000000000000..f46b35a910fd9 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir @@ -0,0 +1,119 @@ +// RUN: mlir-translate --mlir-to-llvmir -verify-diagnostics -split-input-file %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>) { + // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}} + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLanev8 + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>) { + // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}} + 
nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLanev8 + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_shared_ashift +llvm.func @nvvm_tcgen05_mma_shared_ashift(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + // expected-error @below {{A-shift can be applied only when matrix A is in tensor memory}} + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, i64, i64, i32, i1) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_ashift +llvm.func @nvvm_tcgen05_mma_ashift(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + // expected-error @below {{Cannot use collector buffer operation fill or use with ashift}} + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_default +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>) { + // expected-error @below {{mxf4nvf4 requires block scale attribute}} + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scalea, %scaleb + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_default +llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>) { + // expected-error @below {{mxf4 kind does not support block16 attribute}} + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scalea, %scaleb + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, ashift, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}} + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLanev8 + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, 
%a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}} + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLanev8 + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_sp_mma_shared_ashift +llvm.func @nvvm_tcgen05_sp_mma_shared_ashift(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + // expected-error @below {{A-shift can be applied only when matrix A is in tensor memory}} + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_ashift +llvm.func @nvvm_tcgen05_mma_sp_ashift(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + // expected-error @below {{Cannot use collector buffer operation fill or use with ashift}} + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_default +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + // expected-error @below {{mxf4nvf4 requires block scale attribute}} + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scalea, %scaleb + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_default +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + // expected-error @below {{mxf4 kind does not support block16 attribute}} + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scalea, %scaleb + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, ashift, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir new file mode 100644 index 0000000000000..286df36730e77 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir @@ -0,0 +1,442 @@ +// RUN: mlir-translate --mlir-to-llvmir 
%s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_cta_1 +llvm.func @nvvm_tcgen05_mma_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_cta_2 +llvm.func @nvvm_tcgen05_mma_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, 
%a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + llvm.return +} + + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_1 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_2 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, 
ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> 
{{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // 
CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = 
#nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, 
i64, i64, i32, i1, i64, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> 
{{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir new file mode 100644 index 0000000000000..5c7eabee71b4e --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + 
nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, 
%scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale 
%d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, 
i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, 
i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = 
#nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir new file mode 100644 index 0000000000000..3200411aee213 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, 
i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 
{{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, 
!llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call 
void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, 
%enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir new file mode 100644 index 0000000000000..96044cf669d63 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir @@ -0,0 +1,442 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, 
i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, 
!llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, 
%enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, 
i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, 
%enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + 
llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 
1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> 
{{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> 
{{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir new file mode 100644 index 0000000000000..709beb0508bb8 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir @@ -0,0 +1,634 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, 
%enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0) + 
nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, 
!llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata 
mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 
{{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir new file mode 100644 index 0000000000000..798e311778beb --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir @@ -0,0 +1,633 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_cta_1 +llvm.func @nvvm_tcgen05_mma_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, 
!llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 
{{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ 
i32 0, /* cta_group= */ i32 1, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 1, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_cta_2 +llvm.func @nvvm_tcgen05_mma_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // 
CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 1) + 
nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* 
collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_1 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_2 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 
1, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = 
%disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: 
vector<8 x i32>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, 
%enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, 
vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr 
addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + 
nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, 
ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 
0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + llvm.return +} diff --git 
a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir new file mode 100644 index 0000000000000..5f1aeb05888bd --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws +llvm.func @nvvm_tcgen05_mma_ws(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = 
#nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %zero_col_mask: i64) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir new file mode 100644 index 0000000000000..e390e350090ad --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp +llvm.func 
@nvvm_tcgen05_mma_ws_sp(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_sp_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>, %zero_col_mask: i64) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 0, i32 0) 
+ nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = 
#nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir new file mode 100644 index 0000000000000..f7ce5484803e9 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp +llvm.func @nvvm_tcgen05_mma_ws_sp(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, 
i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_sp_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>, %zero_col_mask: i64) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, 
%spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir new file mode 100644 index 0000000000000..cecbb3fbd90af --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws +llvm.func @nvvm_tcgen05_mma_ws(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %zero_col_mask: i64) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + 
collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + llvm.return +} diff --git a/mlir/test/lib/Dialect/Test/TestTypes.cpp b/mlir/test/lib/Dialect/Test/TestTypes.cpp index 614121f1d43dd..9cf64a896d28a 100644 --- a/mlir/test/lib/Dialect/Test/TestTypes.cpp +++ b/mlir/test/lib/Dialect/Test/TestTypes.cpp @@ -569,11 +569,17 @@ TestTensorType::getBufferType( ::mlir::LogicalResult TestTensorType::verifyCompatibleBufferType( ::mlir::bufferization::BufferLikeType bufferType, ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) { - auto testMemref = dyn_cast(bufferType); - if (!testMemref) - return emitError() << "expected TestMemrefType"; + if (auto testMemref = dyn_cast(bufferType)) { + const bool valid = getShape() == testMemref.getShape() && + getElementType() == testMemref.getElementType(); + return mlir::success(valid); + } + + if (auto builtinMemref = dyn_cast(bufferType)) { + const bool valid = getShape() == builtinMemref.getShape() && + getElementType() == builtinMemref.getElementType(); + return mlir::success(valid); + } - const bool valid = getShape() == testMemref.getShape() 
&& - getElementType() == testMemref.getElementType(); - return mlir::success(valid); + return emitError() << "expected MemRefType or TestMemrefType"; }
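
Note on the TestTypes.cpp hunk above: the reworked verifyCompatibleBufferType accepts either the test dialect's memref type or a builtin memref, and only emits the diagnostic when neither cast succeeds. A minimal C++ sketch of that logic follows; the dyn_cast targets (TestMemrefType and mlir::MemRefType) are assumptions inferred from the variable names and the diagnostic text, not a verbatim copy of the patch.

::mlir::LogicalResult TestTensorType::verifyCompatibleBufferType(
    ::mlir::bufferization::BufferLikeType bufferType,
    ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) {
  // Test-dialect buffer: compatible when shape and element type match.
  // (Cast target assumed to be the test dialect's TestMemrefType.)
  if (auto testMemref = dyn_cast<TestMemrefType>(bufferType)) {
    const bool valid = getShape() == testMemref.getShape() &&
                       getElementType() == testMemref.getElementType();
    return mlir::success(valid);
  }

  // Builtin memref: the same shape/element-type compatibility check.
  // (Cast target assumed to be mlir::MemRefType.)
  if (auto builtinMemref = dyn_cast<::mlir::MemRefType>(bufferType)) {
    const bool valid = getShape() == builtinMemref.getShape() &&
                       getElementType() == builtinMemref.getElementType();
    return mlir::success(valid);
  }

  // Neither buffer kind matched.
  return emitError() << "expected MemRefType or TestMemrefType";
}

Compared with the removed code, which rejected anything other than TestMemrefType, this lets TestTensorType verify against builtin memrefs as well when their shape and element type agree.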