From 951ab04d6cdde7fe32e4fbe82ea5384378717765 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Tue, 18 Nov 2025 13:47:28 +0800 Subject: [PATCH 01/35] [mlir][NVVM] Add no-rollback option to NVVM lowering passes (#168477) Add pass options to run lowerings to NVVM without pattern rollback. This makes the dialect conversions easier to debug and improves performance/memory usage. --- mlir/include/mlir/Conversion/Passes.td | 2 ++ mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h | 4 ++++ mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp | 5 ++++- mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp | 1 + mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir | 1 + mlir/test/Conversion/GPUToNVVM/memref.mlir | 1 + mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir | 1 + mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir | 2 +- mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir | 2 +- mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir | 2 +- mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir | 2 +- mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir | 2 +- mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir | 2 +- mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir | 2 +- mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir | 2 +- mlir/test/Integration/GPU/CUDA/assert.mlir | 2 +- mlir/test/Integration/GPU/CUDA/command-line-arg.mlir | 2 +- mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir | 2 +- mlir/test/Integration/GPU/CUDA/dump-ptx.mlir | 2 +- mlir/test/Integration/GPU/CUDA/dump-sass.mlir | 2 +- mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir | 2 +- mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir | 2 +- mlir/test/Integration/GPU/CUDA/printf.mlir | 2 +- mlir/test/Integration/GPU/CUDA/shuffle.mlir | 2 +- mlir/test/Integration/GPU/CUDA/two-modules.mlir | 2 +- 25 files changed, 32 insertions(+), 19 deletions(-) diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 79bc380dbcb7a..0164a2fb9fa81 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -628,6 +628,8 @@ def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> { /*default=*/"false", "Replace memref arguments in GPU functions with bare pointers. " "All memrefs must have static shape.">, + Option<"allowPatternRollback", "allow-pattern-rollback", "bool", "true", + "Experimental performance flag to disallow pattern rollback">, ListOption<"allowedDialects", "allowed-dialects", "std::string", "Run conversion patterns of only the specified dialects">, ]; diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h index fccb49d49da70..34c85de3418ec 100644 --- a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h @@ -58,6 +58,10 @@ struct GPUToNVVMPipelineOptions "Whether to use the bareptr calling convention on the host (warning " "this should be false until the GPU layering is fixed)"), llvm::cl::init(false)}; + PassOptions::Option allowPatternRollback{ + *this, "allow-pattern-rollback", + llvm::cl::desc("Allow pattern rollback during dialect conversion"), + llvm::cl::init(true)}; }; // Options for the gpu to xevm pipeline. 
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index d64c4d64cad84..5848489274c13 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -419,7 +419,10 @@ struct LowerGpuOpsToNVVMOpsPass final if (this->hasRedux) populateGpuSubgroupReduceOpLoweringPattern(converter, llvmPatterns); configureGpuToNVVMConversionLegality(target); - if (failed(applyPartialConversion(m, target, std::move(llvmPatterns)))) + ConversionConfig config; + config.allowPatternRollback = allowPatternRollback; + if (failed( + applyPartialConversion(m, target, std::move(llvmPatterns), config))) signalPassFailure(); } }; diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp index 2c3e4661d266a..5462cddd44718 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp @@ -72,6 +72,7 @@ void buildGpuPassPipeline(OpPassManager &pm, ConvertGpuOpsToNVVMOpsOptions opt; opt.useBarePtrCallConv = options.kernelUseBarePtrCallConv; opt.indexBitwidth = options.indexBitWidth; + opt.allowPatternRollback = options.allowPatternRollback; pm.addNestedPass(createConvertGpuOpsToNVVMOps(opt)); pm.addNestedPass(createCanonicalizerPass()); pm.addNestedPass(createCSEPass()); diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir index a4b5dde8a2187..f1cc1eb983267 100644 --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1' -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 allow-pattern-rollback=0' -split-input-file | FileCheck %s // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 allowed-dialects=func,arith,cf' -split-input-file | FileCheck %s // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 use-bare-ptr-memref-call-conv=1' -split-input-file | FileCheck %s --check-prefix=CHECK-BARE // RUN: mlir-opt %s -transform-interpreter | FileCheck %s diff --git a/mlir/test/Conversion/GPUToNVVM/memref.mlir b/mlir/test/Conversion/GPUToNVVM/memref.mlir index e164ca9103dee..a4e8ead344114 100644 --- a/mlir/test/Conversion/GPUToNVVM/memref.mlir +++ b/mlir/test/Conversion/GPUToNVVM/memref.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -convert-gpu-to-nvvm | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-nvvm="allow-pattern-rollback=0" | FileCheck %s // RUN: mlir-opt %s -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=1' \ // RUN: | FileCheck %s --check-prefix=BARE diff --git a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir index b479467efc208..82c02c1d6ee63 100644 --- a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s +// RUN: mlir-opt --convert-gpu-to-nvvm="allow-pattern-rollback=0" --split-input-file %s | FileCheck %s // RUN: mlir-opt --convert-gpu-to-nvvm="index-bitwidth=32" --split-input-file %s | FileCheck --check-prefix=CHECK32 %s gpu.module @test_module { diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir index 5585d98c25b82..d0001f6ffc376 100644 --- 
a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir index cd90ce3ba2f1a..fcff5f40a6cc7 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir index fec2567f47f15..4718ac94fa0f2 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir index d5633b00313b3..5e3a7e7e7d729 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir index db297b0fc27b7..f1a48ae0c19c5 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir index 65cbc79752177..f0a46cea7ceb9 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git 
a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir index a0c955e4b570c..ddbabd4ddf960 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir index f041df82b4325..5c56e2ddfbd51 100644 --- a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir +++ b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/assert.mlir b/mlir/test/Integration/GPU/CUDA/assert.mlir index 71a21cf4bd620..83cf70cd17078 100644 --- a/mlir/test/Integration/GPU/CUDA/assert.mlir +++ b/mlir/test/Integration/GPU/CUDA/assert.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: mlir-opt %s -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir index 34dde6e03c80e..77a4fa089b62d 100644 --- a/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir +++ b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 ptxas-cmd-options='-v --register-usage-level=8'" -debug-only=serialize-to-binary \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 ptxas-cmd-options='-v --register-usage-level=8' allow-pattern-rollback=0" -debug-only=serialize-to-binary \ // RUN: 2>&1 | FileCheck %s func.func @host_function(%arg0 : f32, %arg1 : memref) { diff --git a/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir b/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir index ed01416d9523a..51f6e36aaa977 100644 --- a/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir +++ b/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir @@ -2,7 +2,7 @@ // increment a global atomic counter and wait for the counter to reach 2. 
// // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | env CUDA_MODULE_LOADING=EAGER mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir index 27ec1ec435fef..efffcaaf23b2e 100644 --- a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir +++ b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline -debug-only=serialize-to-isa \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" -debug-only=serialize-to-isa \ // RUN: 2>&1 | FileCheck %s // CHECK-LABEL: Generated by LLVM NVPTX Back-End diff --git a/mlir/test/Integration/GPU/CUDA/dump-sass.mlir b/mlir/test/Integration/GPU/CUDA/dump-sass.mlir index d32f5efc29d58..f810678569615 100644 --- a/mlir/test/Integration/GPU/CUDA/dump-sass.mlir +++ b/mlir/test/Integration/GPU/CUDA/dump-sass.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline -debug-only=dump-sass \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" -debug-only=dump-sass \ // RUN: 2>&1 | FileCheck %s // CHECK: MOV diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir index 07f3218ae89b2..fe3c2b1d93a1b 100644 --- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir +++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir index b2ac90acde94f..f8f1aa8aaa42e 100644 --- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir +++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir index fd664f2331488..ef116760b69e5 100644 --- a/mlir/test/Integration/GPU/CUDA/printf.mlir +++ b/mlir/test/Integration/GPU/CUDA/printf.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir index a6207d64c038b..a4be5223cd792 100644 --- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir +++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s 
\ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir index c3cee2fda46f3..3490003d6ba19 100644 --- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir +++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \
From ea26d92c5317317464c7b37672520adbd8807054 Mon Sep 17 00:00:00 2001 From: Garth Lei Date: Tue, 18 Nov 2025 14:36:47 +0800 Subject: [PATCH 02/35] [RISCV] Remove unused argument check (NFC) (#168313)
The index == 0 scenario has already been handled by the early return, so only the upper half scenario is relevant here. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index fb298ee35d6c2..921d12757d672 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2535,7 +2535,7 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, // TODO: For sizes which aren't multiples of VLEN sizes, this may not be // a cheap extract. However, this case is important in practice for // shuffled extracts of longer vectors. How resolve? - return (ResElts * 2) == SrcElts && (Index == 0 || Index == ResElts); + return (ResElts * 2) == SrcElts && Index == ResElts; } MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
From 485b3af135dca84f5c81a07cd123b40d6f52e7a6 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 18 Nov 2025 14:38:55 +0800 Subject: [PATCH 03/35] [RISCV] Reduce minimum VL needed for vslidedown.vx in RISCVVLOptimizer (#168392)
Once #149042 is relanded, we will start EVL tail folding vectorized loops that have live-outs, e.g.: ```c int f(int *x, int n) { int y = 0; for (int i = 0; i < n; i++) { y = x[i] + 1; x[y] = y; } return y; } ``` These are vectorized by extracting the last "active lane" in the loop's exit: ```llvm loop: %vl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true) ... exit: %lastidx = sub i64 %vl, 1 %lastelt = extractelement <vscale x 4 x i32> %y, i64 %lastidx ``` Which in RISC-V translates to a vslidedown.vx with a VL of 1: ```llvm bb.loop: %vl:gprnox0 = PseudoVSETVLI ... %y:vr = PseudoVADD_VI_M1 $noreg, %x, 1, AVL=-1 ... bb.exit: %lastidx:gprnox0 = ADDI %vl, -1 %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %y, %lastidx, AVL=1 ``` However, today we fail to reduce the VL of %y in the loop and end up with two extra VL toggles. The reason is that RISCVVLOptimizer is conservative with vslidedown.vx, since it can read the lanes of %y past its own VL. So in `getMinimumVLForUser` we say that vslidedown.vx demands the entirety of %y. One observation with the sequence above is that it only actually needs to read the first %vl lanes of %y, because the last lane of vs2 used is offset + 1.
In this case, that's `%lastidx + 1 = %vl - 1 + 1 = %vl`. This PR teaches RISCVVLOptimizer about this case in `getMinimumVLForVSLIDEDOWN_VX`, and in doing so removes the VL toggles. The one case that I had to think about for a bit was whenever `ADDI %vl, -1` wraps, i.e. when %vl=0 and the resulting offset is all ones. This should always be larger than the largest VLMAX, so vs2 will be completely slid down and absent from the output. So we don't need to read anything from vs2. This patch on its own has no observable effect on llvm-test-suite or SPEC CPU 2017 w/ rva23u64 today. --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 42 +++++++++++++++++- .../test/CodeGen/RISCV/rvv/vl-opt-live-out.ll | 44 +++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vl-opt.mir | 35 +++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 0a8838cbd45c7..c742b92416362 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -62,7 +62,7 @@ struct DemandedVL { }; class RISCVVLOptimizer : public MachineFunctionPass { - const MachineRegisterInfo *MRI; + MachineRegisterInfo *MRI; const MachineDominatorTree *MDT; const TargetInstrInfo *TII; @@ -1392,6 +1392,42 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { return true; } +/// Given a vslidedown.vx like: +/// +/// %slideamt = ADDI %x, -1 +/// %v = PseudoVSLIDEDOWN_VX %passthru, %src, %slideamt, avl=1 +/// +/// %v will only read the first %slideamt + 1 lanes of %src, which = %x. +/// This is a common case when lowering extractelement. +/// +/// Note that if %x is 0, %slideamt will be all ones. In this case %src will be +/// completely slid down and none of its lanes will be read (since %slideamt is +/// greater than the largest VLMAX of 65536) so we can demand any minimum VL. +static std::optional<MachineOperand> +getMinimumVLForVSLIDEDOWN_VX(const MachineOperand &UserOp, + const MachineRegisterInfo *MRI) { + const MachineInstr &MI = *UserOp.getParent(); + if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VSLIDEDOWN_VX) + return std::nullopt; + // We're looking at what lanes are used from the src operand. + if (UserOp.getOperandNo() != 2) + return std::nullopt; + // For now, the AVL must be 1. + const MachineOperand &AVL = MI.getOperand(4); + if (!AVL.isImm() || AVL.getImm() != 1) + return std::nullopt; + // The slide amount must be %x - 1.
+ const MachineOperand &SlideAmt = MI.getOperand(3); + if (!SlideAmt.getReg().isVirtual()) + return std::nullopt; + MachineInstr *SlideAmtDef = MRI->getUniqueVRegDef(SlideAmt.getReg()); + if (SlideAmtDef->getOpcode() != RISCV::ADDI || + SlideAmtDef->getOperand(2).getImm() != -AVL.getImm() || + !SlideAmtDef->getOperand(1).getReg().isVirtual()) + return std::nullopt; + return SlideAmtDef->getOperand(1); +} + DemandedVL RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { const MachineInstr &UserMI = *UserOp.getParent(); @@ -1406,6 +1442,9 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const { return DemandedVL::vlmax(); } + if (auto VL = getMinimumVLForVSLIDEDOWN_VX(UserOp, MRI)) + return *VL; + if (RISCVII::readsPastVL( TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) { LLVM_DEBUG(dbgs() << " Abort because used by unsafe instruction\n"); @@ -1624,6 +1663,7 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const { // All our checks passed. We can reduce VL. VLOp.ChangeToRegister(CommonVL->getReg(), false); + MRI->constrainRegClass(CommonVL->getReg(), &RISCV::GPRNoX0RegClass); return true; } diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll new file mode 100644 index 0000000000000..cf15fad5533b9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s + +define i32 @loop_live_out(ptr %p, i64 %n) { +; CHECK-LABEL: loop_live_out: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mv a2, a0 +; CHECK-NEXT: .LBB0_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a3, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a2) +; CHECK-NEXT: sub a1, a1, a3 +; CHECK-NEXT: vadd.vi v8, v8, 1 +; CHECK-NEXT: vse32.v v8, (a2) +; CHECK-NEXT: slli a2, a3, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: bnez a1, .LBB0_1 +; CHECK-NEXT: # %bb.2: # %exit +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a3 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + br label %loop + +loop: + %avl = phi i64 [%n, %entry], [%avl.next, %loop] + %gep = phi ptr [%p, %entry], [%gep.next, %loop] + %vl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true) + %x = call @llvm.vp.load(ptr %gep, splat (i1 true), i32 %vl) + %y = add %x, splat (i32 1) + call void @llvm.vp.store( %y, ptr %gep, splat (i1 true), i32 %vl) + %vl.zext = zext i32 %vl to i64 + %avl.next = sub i64 %avl, %vl.zext + %gep.next = getelementptr i32, ptr %p, i32 %vl + %ec = icmp eq i64 %avl.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + %lastidx = sub i64 %vl.zext, 1 + %lastelt = extractelement %y, i64 %lastidx + ret i32 %lastelt +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir index 4d6d0e122b1cf..55d1c84d5f8d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir @@ -778,3 +778,38 @@ body: | ; CHECK: DBG_VALUE %0:vr DBG_VALUE %0:vr ... 
+--- +name: vslidedown_vx +tracksRegLiveness: true +body: | + bb.0: + liveins: $x8 + ; CHECK-LABEL: name: vslidedown_vx + ; CHECK: liveins: $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:gprnox0 = COPY $x8 + ; CHECK-NEXT: %y:gprnox0 = ADDI %x, -1 + ; CHECK-NEXT: %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, %x, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:gpr = COPY $x8 + %y:gprnox0 = ADDI %x, -1 + %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */ +... +--- +# Make sure we ignore LIs (ADDI $x0, -1) +name: vslidedown_vx_li +tracksRegLiveness: true +body: | + bb.0: + liveins: $x8 + ; CHECK-LABEL: name: vslidedown_vx_li + ; CHECK: liveins: $x8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %y:gprnox0 = ADDI $x0, -1 + ; CHECK-NEXT: %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */ + %y:gprnox0 = ADDI $x0, -1 + %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */ +...
From 73545337f1bf320ffc8d1938c33ed587201f0c49 Mon Sep 17 00:00:00 2001 From: woruyu <1214539920@qq.com> Date: Tue, 18 Nov 2025 14:43:47 +0800 Subject: [PATCH 04/35] [CIR] X86 vector fcmp-sse vector builtins (#167125)
### Summary This PR resolves https://github.com/llvm/llvm-project/issues/163895 by adding the fcmp-sse part of the X86 vector builtins for CIR.
--------- Co-authored-by: liuzhenya --- .../CIR/Dialect/Builder/CIRBaseBuilder.h | 18 ++ clang/include/clang/CIR/MissingFeatures.h | 1 + clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 73 ++++-- clang/test/CIR/CodeGen/builtin-fcmp-sse.c | 213 ++++++++++++++++++ 4 files changed, 290 insertions(+), 15 deletions(-) create mode 100644 clang/test/CIR/CodeGen/builtin-fcmp-sse.c
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index 737ceac80635b..be9965ae3101f 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -131,6 +131,14 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return cir::IntType::get(getContext(), n, false); } + static unsigned getCIRIntOrFloatBitWidth(mlir::Type eltTy) { + if (auto intType = mlir::dyn_cast<cir::IntType>(eltTy)) + return intType.getWidth(); + if (auto floatType = mlir::dyn_cast<cir::FPTypeInterface>(eltTy)) + return floatType.getWidth(); + + llvm_unreachable("Unsupported type in getCIRIntOrFloatBitWidth"); + } cir::IntType getSIntNTy(int n) { return cir::IntType::get(getContext(), n, true); } @@ -565,6 +573,16 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return cir::CmpOp::create(*this, loc, getBoolTy(), kind, lhs, rhs); } + cir::VecCmpOp createVecCompare(mlir::Location loc, cir::CmpOpKind kind, + mlir::Value lhs, mlir::Value rhs) { + VectorType vecCast = mlir::cast<VectorType>(lhs.getType()); + IntType integralTy = + getSIntNTy(getCIRIntOrFloatBitWidth(vecCast.getElementType())); + VectorType integralVecTy = + VectorType::get(context, integralTy, vecCast.getSize()); + return cir::VecCmpOp::create(*this, loc, integralVecTy, kind, lhs, rhs); + } + mlir::Value createIsNaN(mlir::Location loc, mlir::Value operand) { return createCompare(loc, cir::CmpOpKind::ne, operand, operand); } diff --git
a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index d93ee2675b366..34c2476ffccce 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -259,6 +259,7 @@ struct MissingFeatures { static bool emitBranchThroughCleanup() { return false; } static bool emitCheckedInBoundsGEP() { return false; } static bool emitCondLikelihoodViaExpectIntrinsic() { return false; } + static bool emitConstrainedFPCall() { return false; } static bool emitLifetimeMarkers() { return false; } static bool emitLValueAlignmentAssumption() { return false; } static bool emitNullCheckForDeleteCalls() { return false; } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index ba160373ec77e..ee6900141647f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -33,18 +33,53 @@ static mlir::Value emitIntrinsicCallOp(CIRGenFunction &cgf, const CallExpr *e, .getResult(); } +// OG has unordered comparison as a form of optimization in addition to +// ordered comparison, while CIR doesn't. +// +// This means that we can't encode the comparison code of UGT (unordered +// greater than), at least not at the CIR level. +// +// The boolean shouldInvert compensates for this. +// For example: to get to the comparison code UGT, we pass in +// emitVectorFCmp (OLE, shouldInvert = true) since OLE is the inverse of UGT. + +// There are several ways to support this otherwise: +// - register extra CmpOpKind for unordered comparison types and build the +// translation code for +// to go from CIR -> LLVM dialect. Notice we get this naturally with +// shouldInvert, benefiting from existing infrastructure, albeit having to +// generate an extra `not` at CIR). +// - Just add extra comparison code to a new VecCmpOpKind instead of +// cluttering CmpOpKind. +// - Add a boolean in VecCmpOp to indicate if it's doing unordered or ordered +// comparison +// - Just emit the intrinsics call instead of calling this helper, see how the +// LLVM lowering handles this. +static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder, + llvm::SmallVector &ops, + mlir::Location loc, cir::CmpOpKind pred, + bool shouldInvert) { + assert(!cir::MissingFeatures::cgFPOptionsRAII()); + // TODO(cir): Add isSignaling boolean once emitConstrainedFPCall implemented + assert(!cir::MissingFeatures::emitConstrainedFPCall()); + mlir::Value cmp = builder.createVecCompare(loc, pred, ops[0], ops[1]); + mlir::Value bitCast = builder.createBitcast( + shouldInvert ? 
builder.createNot(cmp) : cmp, ops[0].getType()); + return bitCast; +} + mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, - const CallExpr *e) { + const CallExpr *expr) { if (builtinID == Builtin::BI__builtin_cpu_is) { - cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_is"); + cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_is"); return {}; } if (builtinID == Builtin::BI__builtin_cpu_supports) { - cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_supports"); + cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_supports"); return {}; } if (builtinID == Builtin::BI__builtin_cpu_init) { - cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_init"); + cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_init"); return {}; } @@ -65,7 +100,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, getContext().GetBuiltinType(builtinID, error, &iceArguments); assert(error == ASTContext::GE_None && "Error while getting builtin type."); - for (auto [idx, arg] : llvm::enumerate(e->arguments())) + for (auto [idx, arg] : llvm::enumerate(expr->arguments())) ops.push_back(emitScalarOrConstFoldImmArg(iceArguments, idx, arg)); CIRGenBuilderTy &builder = getBuilder(); @@ -75,15 +110,15 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, default: return {}; case X86::BI_mm_clflush: - return emitIntrinsicCallOp(*this, e, "x86.sse2.clflush", voidTy, ops[0]); + return emitIntrinsicCallOp(*this, expr, "x86.sse2.clflush", voidTy, ops[0]); case X86::BI_mm_lfence: - return emitIntrinsicCallOp(*this, e, "x86.sse2.lfence", voidTy); + return emitIntrinsicCallOp(*this, expr, "x86.sse2.lfence", voidTy); case X86::BI_mm_pause: - return emitIntrinsicCallOp(*this, e, "x86.sse2.pause", voidTy); + return emitIntrinsicCallOp(*this, expr, "x86.sse2.pause", voidTy); case X86::BI_mm_mfence: - return emitIntrinsicCallOp(*this, e, "x86.sse2.mfence", voidTy); + return emitIntrinsicCallOp(*this, expr, "x86.sse2.mfence", voidTy); case X86::BI_mm_sfence: - return emitIntrinsicCallOp(*this, e, "x86.sse.sfence", voidTy); + return emitIntrinsicCallOp(*this, expr, "x86.sse.sfence", voidTy); case X86::BI_mm_prefetch: case X86::BI__rdtsc: case X86::BI__builtin_ia32_rdtscp: @@ -96,7 +131,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_undef128: case X86::BI__builtin_ia32_undef256: case X86::BI__builtin_ia32_undef512: - cgm.errorNYI(e->getSourceRange(), + cgm.errorNYI(expr->getSourceRange(), std::string("unimplemented X86 builtin call: ") + getContext().BuiltinInfo.getName(builtinID)); return {}; @@ -118,12 +153,12 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, index &= numElts - 1; cir::ConstantOp indexVal = - builder.getUInt64(index, getLoc(e->getExprLoc())); + builder.getUInt64(index, getLoc(expr->getExprLoc())); // These builtins exist so we can ensure the index is an ICE and in range. // Otherwise we could just do this in the header file. 
- return cir::VecExtractOp::create(builder, getLoc(e->getExprLoc()), ops[0], - indexVal); + return cir::VecExtractOp::create(builder, getLoc(expr->getExprLoc()), + ops[0], indexVal); } case X86::BI__builtin_ia32_vec_set_v4hi: case X86::BI__builtin_ia32_vec_set_v16qi: @@ -758,10 +793,18 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_cmpunordpd: case X86::BI__builtin_ia32_cmpneqps: case X86::BI__builtin_ia32_cmpneqpd: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_cmpnltps: case X86::BI__builtin_ia32_cmpnltpd: + return emitVectorFCmp(builder, ops, getLoc(expr->getExprLoc()), + cir::CmpOpKind::lt, /*shouldInvert=*/true); case X86::BI__builtin_ia32_cmpnleps: case X86::BI__builtin_ia32_cmpnlepd: + return emitVectorFCmp(builder, ops, getLoc(expr->getExprLoc()), + cir::CmpOpKind::le, /*shouldInvert=*/true); case X86::BI__builtin_ia32_cmpordps: case X86::BI__builtin_ia32_cmpordpd: case X86::BI__builtin_ia32_cmpph128_mask: @@ -846,7 +889,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3: case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: case X86::BI__builtin_ia32_prefetchi: - cgm.errorNYI(e->getSourceRange(), + cgm.errorNYI(expr->getSourceRange(), std::string("unimplemented X86 builtin call: ") + getContext().BuiltinInfo.getName(builtinID)); return {}; diff --git a/clang/test/CIR/CodeGen/builtin-fcmp-sse.c b/clang/test/CIR/CodeGen/builtin-fcmp-sse.c new file mode 100644 index 0000000000000..c273d6b3fca0e --- /dev/null +++ b/clang/test/CIR/CodeGen/builtin-fcmp-sse.c @@ -0,0 +1,213 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); +typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); + +__m128 test_cmpnleps(__m128 A, __m128 B) { + // CIR-LABEL: cir.func dso_local @test_cmpnleps( + // CIR: %[[ARG0:.*]]: !cir.vector<4 x !cir.float> {{.*}}, %[[ARG1:.*]]: !cir.vector<4 x !cir.float> {{.*}}) -> !cir.vector<4 x !cir.float> inline(never) { + // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["A", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["B", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["__retval"] {alignment = 16 : i64} + // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<4 x !cir.float>, !cir.ptr> + // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<4 x !cir.float>, !cir.ptr> + // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: %[[VEC_0:.*]] = cir.vec.cmp(le, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i> + // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i> + // 
CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float> + // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<4 x !cir.float>, !cir.ptr> + // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: cir.return %[[LOAD_2]] : !cir.vector<4 x !cir.float> + // CIR: } + + // LLVM-LABEL: define dso_local <4 x float> @test_cmpnleps( + // LLVM-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { + // LLVM-NEXT: [[TMP3:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: [[TMP4:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: [[TMP5:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: store <4 x float> [[TMP0]], ptr [[TMP3]], align 16 + // LLVM-NEXT: store <4 x float> [[TMP1]], ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[TMP3]], align 16 + // LLVM-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP8:%.*]] = fcmp ole <4 x float> [[TMP6]], [[TMP7]] + // LLVM-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32> + // LLVM-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], splat (i32 -1) + // LLVM-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <4 x float> + // LLVM-NEXT: store <4 x float> [[TMP11]], ptr [[TMP5]], align 16 + // LLVM-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[TMP5]], align 16 + // LLVM-NEXT: ret <4 x float> [[TMP12]] + + // OGCG-LABEL: define dso_local <4 x float> @test_cmpnleps( + // OGCG-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { + // OGCG-NEXT: [[ENTRY:.*:]] + // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 + // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 + // OGCG-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16 + // OGCG-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16 + // OGCG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP2:%.*]] = fcmp ugt <4 x float> [[TMP0]], [[TMP1]] + // OGCG-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> + // OGCG-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> + // OGCG-NEXT: ret <4 x float> [[TMP4]] + return __builtin_ia32_cmpnleps(A, B); +} + +__m128d test_cmpnlepd(__m128d A, __m128d B) { + // CIR-LABEL: cir.func dso_local @test_cmpnlepd( + // CIR: %[[ARG0:.*]]: !cir.vector<2 x !cir.double> {{.*}}, %[[ARG1:.*]]: !cir.vector<2 x !cir.double> {{.*}}) -> !cir.vector<2 x !cir.double> inline(never) { + // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["A", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["B", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["__retval"] {alignment = 16 : i64} + // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<2 x !cir.double>, !cir.ptr> + // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<2 x !cir.double>, !cir.ptr> + // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: %[[VEC_0:.*]] = cir.vec.cmp(le, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i> + // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<2 x !s64i>, 
!cir.vector<2 x !s64i> + // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double> + // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<2 x !cir.double>, !cir.ptr> + // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: cir.return %[[LOAD_2]] : !cir.vector<2 x !cir.double> + // CIR: } + + // LLVM-LABEL: define dso_local <2 x double> @test_cmpnlepd( + // LLVM-SAME: <2 x double> [[TMP0:%.*]], <2 x double> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { + // LLVM-NEXT: [[TMP3:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: [[TMP4:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: [[TMP5:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: store <2 x double> [[TMP0]], ptr [[TMP3]], align 16 + // LLVM-NEXT: store <2 x double> [[TMP1]], ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16 + // LLVM-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP8:%.*]] = fcmp ole <2 x double> [[TMP6]], [[TMP7]] + // LLVM-NEXT: [[TMP9:%.*]] = sext <2 x i1> [[TMP8]] to <2 x i64> + // LLVM-NEXT: [[TMP10:%.*]] = xor <2 x i64> [[TMP9]], splat (i64 -1) + // LLVM-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <2 x double> + // LLVM-NEXT: store <2 x double> [[TMP11]], ptr [[TMP5]], align 16 + // LLVM-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[TMP5]], align 16 + // LLVM-NEXT: ret <2 x double> [[TMP12]] + + // OGCG-LABEL: define dso_local <2 x double> @test_cmpnlepd( + // OGCG-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { + // OGCG-NEXT: [[ENTRY:.*:]] + // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 + // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 + // OGCG-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16 + // OGCG-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16 + // OGCG-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP2:%.*]] = fcmp ugt <2 x double> [[TMP0]], [[TMP1]] + // OGCG-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> + // OGCG-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> + // OGCG-NEXT: ret <2 x double> [[TMP4]] + return __builtin_ia32_cmpnlepd(A, B); +} + +__m128 test_cmpnltps(__m128 A, __m128 B) { + // CIR-LABEL: cir.func dso_local @test_cmpnltps( + // CIR-SAME: %[[ARG0:.*]]: !cir.vector<4 x !cir.float> {{.*}}, %[[ARG1:.*]]: !cir.vector<4 x !cir.float> {{.*}}) -> !cir.vector<4 x !cir.float> inline(never) { + // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["A", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["B", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr>, ["__retval"] {alignment = 16 : i64} + // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<4 x !cir.float>, !cir.ptr> + // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<4 x !cir.float>, !cir.ptr> + // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: %[[VEC_0:.*]] = cir.vec.cmp(lt, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i> + // CIR: %[[UNARY_0:.*]] = cir.unary(not, 
%[[VEC_0]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i> + // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float> + // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<4 x !cir.float>, !cir.ptr> + // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr>, !cir.vector<4 x !cir.float> + // CIR: cir.return %[[LOAD_2]] : !cir.vector<4 x !cir.float> + // CIR: } + + // LLVM-LABEL: define dso_local <4 x float> @test_cmpnltps( + // LLVM-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { + // LLVM-NEXT: [[TMP3:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: [[TMP4:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: [[TMP5:%.*]] = alloca <4 x float>, i64 1, align 16 + // LLVM-NEXT: store <4 x float> [[TMP0]], ptr [[TMP3]], align 16 + // LLVM-NEXT: store <4 x float> [[TMP1]], ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[TMP3]], align 16 + // LLVM-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP8:%.*]] = fcmp olt <4 x float> [[TMP6]], [[TMP7]] + // LLVM-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32> + // LLVM-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], splat (i32 -1) + // LLVM-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <4 x float> + // LLVM-NEXT: store <4 x float> [[TMP11]], ptr [[TMP5]], align 16 + // LLVM-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[TMP5]], align 16 + // LLVM-NEXT: ret <4 x float> [[TMP12]] + + // OGCG-LABEL: define dso_local <4 x float> @test_cmpnltps( + // OGCG-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { + // OGCG-NEXT: [[ENTRY:.*:]] + // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 + // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 + // OGCG-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16 + // OGCG-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16 + // OGCG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP2:%.*]] = fcmp uge <4 x float> [[TMP0]], [[TMP1]] + // OGCG-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> + // OGCG-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> + // OGCG-NEXT: ret <4 x float> [[TMP4]] + return __builtin_ia32_cmpnltps(A, B); +} + +__m128d test_cmpnltpd(__m128d A, __m128d B) { + // CIR-LABEL: cir.func dso_local @test_cmpnltpd( + // CIR: %[[ARG0:.*]]: !cir.vector<2 x !cir.double> {{.*}}, %[[ARG1:.*]]: !cir.vector<2 x !cir.double> {{.*}}) -> !cir.vector<2 x !cir.double> inline(never) { + // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["A", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["B", init] {alignment = 16 : i64} + // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["__retval"] {alignment = 16 : i64} + // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<2 x !cir.double>, !cir.ptr> + // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<2 x !cir.double>, !cir.ptr> + // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: %[[VEC_0:.*]] = cir.vec.cmp(lt, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i> + // CIR: %[[UNARY_0:.*]] = 
cir.unary(not, %[[VEC_0]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i> + // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double> + // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<2 x !cir.double>, !cir.ptr> + // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr>, !cir.vector<2 x !cir.double> + // CIR: cir.return %[[LOAD_2]] : !cir.vector<2 x !cir.double> + // CIR: } + + // LLVM-LABEL: define dso_local <2 x double> @test_cmpnltpd( + // LLVM-SAME: <2 x double> [[TMP0:%.*]], <2 x double> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { + // LLVM-NEXT: [[TMP3:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: [[TMP4:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: [[TMP5:%.*]] = alloca <2 x double>, i64 1, align 16 + // LLVM-NEXT: store <2 x double> [[TMP0]], ptr [[TMP3]], align 16 + // LLVM-NEXT: store <2 x double> [[TMP1]], ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16 + // LLVM-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[TMP4]], align 16 + // LLVM-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP6]], [[TMP7]] + // LLVM-NEXT: [[TMP9:%.*]] = sext <2 x i1> [[TMP8]] to <2 x i64> + // LLVM-NEXT: [[TMP10:%.*]] = xor <2 x i64> [[TMP9]], splat (i64 -1) + // LLVM-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <2 x double> + // LLVM-NEXT: store <2 x double> [[TMP11]], ptr [[TMP5]], align 16 + // LLVM-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[TMP5]], align 16 + // LLVM-NEXT: ret <2 x double> [[TMP12]] + + // OGCG-LABEL: define dso_local <2 x double> @test_cmpnltpd( + // OGCG-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { + // OGCG-NEXT: [[ENTRY:.*:]] + // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 + // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 + // OGCG-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16 + // OGCG-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16 + // OGCG-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16 + // OGCG-NEXT: [[TMP2:%.*]] = fcmp uge <2 x double> [[TMP0]], [[TMP1]] + // OGCG-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> + // OGCG-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> + // OGCG-NEXT: ret <2 x double> [[TMP4]] + return __builtin_ia32_cmpnltpd(A, B); +} From 886d24d03adb25f9309a60f9eb59552bb9df2587 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 18 Nov 2025 07:46:46 +0100 Subject: [PATCH 05/35] [clang][bytecode] Fix fallthrough to switch labels (#168484) We need to fallthrough here in case we're not jumping to the labels. This is only needed in expression contexts. 
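To illustrate the expression contexts this affects, here is a minimal sketch, adapted from the literals.cpp test added below: a switch inside a GNU statement expression that is evaluated as a constant expression, where control must fall through from the case body to the trailing expression.
```cpp
// Adapted from the new literals.cpp test; values are taken from that test.
static_assert(({
  int i = 20;
  switch (10) {
  case 10:
    i = 300; // case body must fall through to the end of the switch
  }
  i; // the statement expression then yields the updated value
}) == 300);
```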
--- clang/lib/AST/ByteCode/Compiler.cpp | 2 ++ clang/test/AST/ByteCode/literals.cpp | 11 +++++++++++ 2 files changed, 13 insertions(+)
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index f8bbfed8bb387..8779ffab13b86 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6057,6 +6057,7 @@ bool Compiler<Emitter>::visitSwitchStmt(const SwitchStmt *S) { DefaultLabel); if (!this->visitStmt(S->getBody())) return false; + this->fallthrough(EndLabel); this->emitLabel(EndLabel); return LS.destroyLocals(); @@ -6064,6 +6065,7 @@ template <class Emitter> bool Compiler<Emitter>::visitCaseStmt(const CaseStmt *S) { + this->fallthrough(CaseLabels[S]); this->emitLabel(CaseLabels[S]); return this->visitStmt(S->getSubStmt()); }
diff --git a/clang/test/AST/ByteCode/literals.cpp b/clang/test/AST/ByteCode/literals.cpp index 5028ebfa3de30..c6d79f9c60058 100644 --- a/clang/test/AST/ByteCode/literals.cpp +++ b/clang/test/AST/ByteCode/literals.cpp @@ -1270,6 +1270,17 @@ namespace StmtExprs { namespace CrossFuncLabelDiff { constexpr long a(bool x) { return x ? 0 : (intptr_t)&&lbl + (0 && ({lbl: 0;})); } } + + /// GCC agrees with the bytecode interpreter here. + void switchInSE() { + static_assert(({ // ref-error {{not an integral constant expression}} + int i = 20; + switch(10) { + case 10: i = 300; // ref-note {{a constant expression cannot modify an object that is visible outside that expression}} + } + i; + }) == 300); + } } #endif
From f15b756b56d0653181f062901916806bc5eba280 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Tue, 18 Nov 2025 17:51:37 +1100 Subject: [PATCH 06/35] [ORC] Remove unnecessary LLVM_ABI on function def. NFCI. (#168478)
--- llvm/lib/ExecutionEngine/Orc/MachO.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/ExecutionEngine/Orc/MachO.cpp b/llvm/lib/ExecutionEngine/Orc/MachO.cpp index 731d24d1272d4..6b0f96da19dc6 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachO.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachO.cpp @@ -282,7 +282,7 @@ Expected<bool> ForceLoadMachOArchiveMembers::operator()( return true; } -LLVM_ABI SmallVector<std::pair<uint32_t, uint32_t>> +SmallVector<std::pair<uint32_t, uint32_t>> noFallbackArchs(uint32_t CPUType, uint32_t CPUSubType) { SmallVector<std::pair<uint32_t, uint32_t>> Result; Result.push_back({CPUType, CPUSubType});
From 6886d4945f8c46b64fecf2fa6708128bcee8cadc Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Tue, 18 Nov 2025 02:46:43 -0500 Subject: [PATCH 07/35] [RISCV] Add an option to enable CFIInstrInserter.
(#164477) --- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 9 +++++ llvm/lib/Target/RISCV/RISCVFrameLowering.h | 3 ++ llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 10 +++++- llvm/test/CodeGen/RISCV/pipeline-options.ll | 35 ++++++++++++++++++++ 4 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/RISCV/pipeline-options.ll diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index f7fc9528920a6..75e7cf347e461 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -2507,3 +2507,12 @@ void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF, } } } + +int RISCVFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const { + return 0; +} + +Register +RISCVFrameLowering::getInitialCFARegister(const MachineFunction &MF) const { + return RISCV::X2; +} diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index 6af63a4885f35..87980dfb09f96 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -23,6 +23,9 @@ class RISCVFrameLowering : public TargetFrameLowering { public: explicit RISCVFrameLowering(const RISCVSubtarget &STI); + int getInitialCFAOffset(const MachineFunction &MF) const override; + Register getInitialCFARegister(const MachineFunction &MF) const override; + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 16ef67da83128..911bd7ee2876f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -103,6 +103,11 @@ static cl::opt cl::desc("Enable Machine Pipeliner for RISC-V"), cl::init(false), cl::Hidden); +static cl::opt EnableCFIInstrInserter( + "riscv-enable-cfi-instr-inserter", + cl::desc("Enable CFI Instruction Inserter for RISC-V"), cl::init(false), + cl::Hidden); + extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { RegisterTargetMachine X(getTheRISCV32Target()); RegisterTargetMachine Y(getTheRISCV64Target()); @@ -169,7 +174,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, if (TT.isOSFuchsia() && !TT.isArch64Bit()) report_fatal_error("Fuchsia is only supported for 64-bit"); - setCFIFixup(true); + setCFIFixup(!EnableCFIInstrInserter); } const RISCVSubtarget * @@ -578,6 +583,9 @@ void RISCVPassConfig::addPreEmitPass2() { addPass(createUnpackMachineBundles([&](const MachineFunction &MF) { return MF.getFunction().getParent()->getModuleFlag("kcfi"); })); + + if (EnableCFIInstrInserter) + addPass(createCFIInstrInserter()); } void RISCVPassConfig::addMachineSSAOptimization() { diff --git a/llvm/test/CodeGen/RISCV/pipeline-options.ll b/llvm/test/CodeGen/RISCV/pipeline-options.ll new file mode 100644 index 0000000000000..26c9aaba09c94 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/pipeline-options.ll @@ -0,0 +1,35 @@ +; RUN: llc -mtriple=riscv64 -O3 \ +; RUN: -debug-pass=Structure < %s -o /dev/null 2>&1 | \ +; RUN: FileCheck %s --check-prefix=O3-WITHOUT-ENABLE-CFI-INSTR-INSERTER + +; RUN: llc -mtriple=riscv64 -O3 \ +; RUN: --riscv-enable-cfi-instr-inserter=true \ +; RUN: -debug-pass=Structure < %s -o /dev/null 2>&1 | \ +; RUN: FileCheck %s --check-prefix=O3-ENABLE-CFI-INSTR-INSERTER + +; RUN: llc -mtriple=riscv64 -O0 \ 
+; RUN:   -debug-pass=Structure < %s -o /dev/null 2>&1 | \
+; RUN:   FileCheck %s --check-prefix=O0-WITHOUT-ENABLE-CFI-INSTR-INSERTER
+
+; RUN: llc -mtriple=riscv64 -O0 \
+; RUN:   --riscv-enable-cfi-instr-inserter=true \
+; RUN:   -debug-pass=Structure < %s -o /dev/null 2>&1 | \
+; RUN:   FileCheck %s --check-prefix=O0-ENABLE-CFI-INSTR-INSERTER
+
+; REQUIRES: asserts
+
+; O3-WITHOUT-ENABLE-CFI-INSTR-INSERTER-LABEL: Pass Arguments:
+; NO-O3-WITHOUT-ENABLE-CFI-INSTR-INSERTER: Check CFA info and insert CFI instructions if needed
+; O3-WITHOUT-ENABLE-CFI-INSTR-INSERTER: Insert CFI remember/restore state instructions
+
+; O3-ENABLE-CFI-INSTR-INSERTER-LABEL: Pass Arguments:
+; O3-ENABLE-CFI-INSTR-INSERTER: Check CFA info and insert CFI instructions if needed
+; NO-O3-ENABLE-CFI-INSTR-INSERTER: Insert CFI remember/restore state instructions
+
+; O0-WITHOUT-ENABLE-CFI-INSTR-INSERTER-LABEL: Pass Arguments:
+; NO-O0-WITHOUT-ENABLE-CFI-INSTR-INSERTER: Check CFA info and insert CFI instructions if needed
+; O0-WITHOUT-ENABLE-CFI-INSTR-INSERTER: Insert CFI remember/restore state instructions
+
+; O0-ENABLE-CFI-INSTR-INSERTER-LABEL: Pass Arguments:
+; O0-ENABLE-CFI-INSTR-INSERTER: Check CFA info and insert CFI instructions if needed
+; NO-O0-ENABLE-CFI-INSTR-INSERTER: Insert CFI remember/restore state instructions

From 5327c6b57efba007350ab961899150d3fbfe5b45 Mon Sep 17 00:00:00 2001
From: Matthias Springer
Date: Tue, 18 Nov 2025 16:08:27 +0800
Subject: [PATCH 08/35] [mlir][SCF] Add pass option to deactivate pattern
 rollback (#168481)

Add a pass option to `convert-scf-to-cf` to deactivate pattern rollback
for better performance. The lowering patterns from SCF->CF benefit a lot
from this feature because `splitBlock` is expensive in the rollback
driver.

---
 mlir/include/mlir/Conversion/Passes.td                    | 4 ++++
 mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp | 7 +++++--
 mlir/test/Conversion/SCFToControlFlow/convert-to-cfg.mlir | 1 +
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 0164a2fb9fa81..75ab4b64b7f38 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -1086,6 +1086,10 @@ def SCFToControlFlowPass : Pass<"convert-scf-to-cf"> {
   let summary = "Convert SCF dialect to ControlFlow dialect, replacing structured"
                 " control flow with a CFG";
   let dependentDialects = ["cf::ControlFlowDialect"];
+  let options = [
+    Option<"allowPatternRollback", "allow-pattern-rollback", "bool", "true",
+           "Experimental performance flag to disallow pattern rollback">
+  ];
 }

 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
index 37cfc9f2c23e6..03842cc9bd3a0 100644
--- a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
+++ b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
@@ -36,6 +36,7 @@ namespace {

 struct SCFToControlFlowPass
     : public impl::SCFToControlFlowPassBase<SCFToControlFlowPass> {
+  using Base::Base;
   void runOnOperation() override;
 };

@@ -736,7 +737,9 @@ void SCFToControlFlowPass::runOnOperation() {
   target.addIllegalOp();
   target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });

-  if (failed(
-          applyPartialConversion(getOperation(), target, std::move(patterns))))
+  ConversionConfig config;
+  config.allowPatternRollback = allowPatternRollback;
+  if (failed(applyPartialConversion(getOperation(), target,
+                                    std::move(patterns),
+                                    config)))
     signalPassFailure();
 }
diff --git a/mlir/test/Conversion/SCFToControlFlow/convert-to-cfg.mlir b/mlir/test/Conversion/SCFToControlFlow/convert-to-cfg.mlir
index 483c7b35c6ec8..0c4f20e8d1a04 100644
--- a/mlir/test/Conversion/SCFToControlFlow/convert-to-cfg.mlir
+++ b/mlir/test/Conversion/SCFToControlFlow/convert-to-cfg.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-cf -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-cf="allow-pattern-rollback=0" -split-input-file %s | FileCheck %s

 // CHECK-LABEL: func @simple_std_for_loop(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
 // CHECK-NEXT:  cf.br ^bb1(%{{.*}} : index)

From a61889580e5244a7a25610bc23b9a0d7f69e1200 Mon Sep 17 00:00:00 2001
From: Michael Bedy
Date: Tue, 18 Nov 2025 03:35:29 -0500
Subject: [PATCH 09/35] [SLP] Invariant loads cannot have a memory dependency
 on stores. (#167929)

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  11 ++
 .../AMDGPU/invariant-load-no-alias-store.ll   | 121 ++++++++++++++++++
 2 files changed, 132 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ff7149044d199..deb8ee2d88055 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21479,7 +21479,18 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
            "new ScheduleData already in scheduling region");
     SD->init(SchedulingRegionID, I);

+    auto CanIgnoreLoad = [](const Instruction *I) {
+      const auto *LI = dyn_cast<LoadInst>(I);
+      // If there is a simple load marked as invariant, we can ignore it.
+      // But, in the (unlikely) case of non-simple invariant load,
+      // we should not ignore it.
+      return LI && LI->isSimple() &&
+             LI->getMetadata(LLVMContext::MD_invariant_load);
+    };
+
     if (I->mayReadOrWriteMemory() &&
+        // Simple InvariantLoad does not depend on other memory accesses.
+ !CanIgnoreLoad(I) && (!isa(I) || (cast(I)->getIntrinsicID() != Intrinsic::sideeffect && cast(I)->getIntrinsicID() != diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll new file mode 100644 index 0000000000000..87537c05573ae --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes="function(slp-vectorizer)" -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 %s -S | FileCheck %s + +define void @test(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0 +; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0 +; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[A0PTR]], align 2, !invariant.load [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[B0PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd reassoc <2 x half> [[TMP0]], [[TMP1]] +; CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[P0]], align 2 +; CHECK-NEXT: ret void +; +entry: + %p0 = getelementptr half, ptr addrspace(1) %base, i32 0 + %p1 = getelementptr half, ptr addrspace(1) %base, i32 1 + ; First pair of invariant loads from otherA. 
+ %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0 + %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0 + %A0 = load half, ptr addrspace(1) %A0PTR, align 2, !invariant.load !0 + %B0 = load half, ptr addrspace(1) %B0PTR, align 2, !invariant.load !0 + %add0 = fadd reassoc half %A0, %B0 + store half %add0, ptr addrspace(1) %p0, align 2 + %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1 + %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1 + %A1 = load half, ptr addrspace(1) %A1PTR, align 2, !invariant.load !0 + %B1 = load half, ptr addrspace(1) %B1PTR, align 2, !invariant.load !0 + %add1 = fadd reassoc half %A1, %B1 + store half %add1, ptr addrspace(1) %p1, align 2 + ret void +} + + +define void @aliastest(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 { +; CHECK-LABEL: define void @aliastest( +; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0 +; CHECK-NEXT: [[P1:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 1 +; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0 +; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0 +; CHECK-NEXT: [[A0:%.*]] = load half, ptr addrspace(1) [[A0PTR]], align 2 +; CHECK-NEXT: [[B0:%.*]] = load half, ptr addrspace(1) [[B0PTR]], align 2 +; CHECK-NEXT: [[ADD0:%.*]] = fadd reassoc half [[A0]], [[B0]] +; CHECK-NEXT: store half [[ADD0]], ptr addrspace(1) [[P0]], align 2 +; CHECK-NEXT: [[A1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 1 +; CHECK-NEXT: [[B1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 1 +; CHECK-NEXT: [[A1:%.*]] = load half, ptr addrspace(1) [[A1PTR]], align 2 +; CHECK-NEXT: [[B1:%.*]] = load half, ptr addrspace(1) [[B1PTR]], align 2 +; CHECK-NEXT: [[ADD1:%.*]] = fadd reassoc half [[A1]], [[B1]] +; CHECK-NEXT: store half [[ADD1]], ptr addrspace(1) [[P1]], align 2 +; CHECK-NEXT: ret void +; +entry: + %p0 = getelementptr half, ptr addrspace(1) %base, i32 0 + %p1 = getelementptr half, ptr addrspace(1) %base, i32 1 + ; First pair of invariant loads from otherA. 
+ %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0 + %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0 + %A0 = load half, ptr addrspace(1) %A0PTR, align 2 + %B0 = load half, ptr addrspace(1) %B0PTR, align 2 + %add0 = fadd reassoc half %A0, %B0 + store half %add0, ptr addrspace(1) %p0, align 2 + %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1 + %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1 + %A1 = load half, ptr addrspace(1) %A1PTR, align 2 + %B1 = load half, ptr addrspace(1) %B1PTR, align 2 + %add1 = fadd reassoc half %A1, %B1 + store half %add1, ptr addrspace(1) %p1, align 2 + ret void +} + +define void @voltest(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 { +; CHECK-LABEL: define void @voltest( +; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0 +; CHECK-NEXT: [[P1:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 1 +; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0 +; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0 +; CHECK-NEXT: [[A0:%.*]] = load volatile half, ptr addrspace(1) [[A0PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[B0:%.*]] = load volatile half, ptr addrspace(1) [[B0PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[ADD0:%.*]] = fadd reassoc half [[A0]], [[B0]] +; CHECK-NEXT: store half [[ADD0]], ptr addrspace(1) [[P0]], align 2 +; CHECK-NEXT: [[A1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 1 +; CHECK-NEXT: [[B1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 1 +; CHECK-NEXT: [[A1:%.*]] = load volatile half, ptr addrspace(1) [[A1PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[B1:%.*]] = load volatile half, ptr addrspace(1) [[B1PTR]], align 2, !invariant.load [[META0]] +; CHECK-NEXT: [[ADD1:%.*]] = fadd reassoc half [[A1]], [[B1]] +; CHECK-NEXT: store half [[ADD1]], ptr addrspace(1) [[P1]], align 2 +; CHECK-NEXT: ret void +; +entry: + %p0 = getelementptr half, ptr addrspace(1) %base, i32 0 + %p1 = getelementptr half, ptr addrspace(1) %base, i32 1 + ; First pair of invariant loads from otherA. + %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0 + %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0 + %A0 = load volatile half, ptr addrspace(1) %A0PTR, align 2, !invariant.load !0 + %B0 = load volatile half, ptr addrspace(1) %B0PTR, align 2, !invariant.load !0 + %add0 = fadd reassoc half %A0, %B0 + store half %add0, ptr addrspace(1) %p0, align 2 + %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1 + %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1 + %A1 = load volatile half, ptr addrspace(1) %A1PTR, align 2, !invariant.load !0 + %B1 = load volatile half, ptr addrspace(1) %B1PTR, align 2, !invariant.load !0 + %add1 = fadd reassoc half %A1, %B1 + store half %add1, ptr addrspace(1) %p1, align 2 + ret void +} + + +attributes #0 = { nounwind } + +!0 = !{} +;. +; CHECK: [[META0]] = !{} +;. From 9f6932138569a0c2267076cb17022f58f07059ab Mon Sep 17 00:00:00 2001 From: John Harrison Date: Tue, 18 Nov 2025 00:42:19 -0800 Subject: [PATCH 10/35] Improve error response message parsing for DAP evaluate requests. (#168430) Updated the evaluate handler to check for DAP ErrorResponse bodies, which are used to display user errors if a request fails. 
This was updated in PR https://github.com/llvm/llvm-project/pull/167720.
This should fix https://lab.llvm.org/buildbot/#/builders/163

---
 .../debuginfo-tests/dexter/dex/debugger/DAP.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py
index a849990678d42..792e0be629fc4 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py
@@ -335,6 +335,7 @@ def send_message(self, payload: dict) -> int:
         self._proc.stdin.flush()
         return self.seq

+    @staticmethod
     def _handle_message(
         message: dict, debugger_state: DAPDebuggerState, logger: Logger
     ):
@@ -419,6 +420,7 @@ def _handle_message(
             request_seq = message["request_seq"]
             debugger_state.set_response(request_seq, message)

+    @staticmethod
    def _colorize_dap_message(message: dict) -> dict:
        colorized_message = copy.deepcopy(message)
        if colorized_message["type"] == "event":
@@ -432,6 +434,7 @@ def _colorize_dap_message(message: dict) -> dict:
            colorized_message["command"] = f"{colorized_message['command']}"
        return colorized_message

+    @staticmethod
     def _read_dap_output(
         proc: subprocess.Popen,
         debugger_state: DAPDebuggerState,
@@ -454,6 +457,7 @@ def _read_dap_output(
                 DAP._handle_message(message, debugger_state, logger)
             buffer = rest[content_length:]

+    @staticmethod
     def _read_dap_err(proc: subprocess.Popen, logger: Logger):
         while True:
             err: bytes = proc.stderr.readline()
@@ -930,10 +934,16 @@ def evaluate_expression(self, expression, frame_idx=0) -> ValueIR:
             )
         )
         eval_response = self._await_response(eval_req_id)
+        result: str = ""
         if not eval_response["success"]:
-            result: str = eval_response["message"]
+            if eval_response["body"].get("error", None):
+                result = eval_response["body"]["error"]["format"]
+            elif eval_response["message"]:
+                result = eval_response["message"]
+            else:
+                result = ""
         else:
-            result: str = eval_response["body"]["result"]
+            result = eval_response["body"]["result"]
         type_str = eval_response["body"].get("type")
         return self._evaluate_result_value(expression, result, type_str)

From ee1abb8d80691e5ef24da3273587dc789c9c6f1b Mon Sep 17 00:00:00 2001
From: Robert Imschweiler
Date: Tue, 18 Nov 2025 09:55:36 +0100
Subject: [PATCH 11/35] [AMDGPU][clang] Fix clang driver check for multiple
 sanitizer arguments (#166851)

`-fsanitize=address,fuzzer` should be rejected like
`-fsanitize=fuzzer,address`. The address sanitizer enables the device
sanitizer pipeline. The fuzzer implicitly turns on LLVM's
SanitizerCoverage, which the driver then forwards to the device cc1.
SanitizerCoverage is not supported on amdgcn.
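As an illustration, a minimal sketch of the resulting driver behavior (the
invocation and the input file main.c are hypothetical, modeled on the RUN
lines in the tests below; the diagnostic text is taken from the new
FileCheck patterns):

  $ clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx908:xnack+ \
      -fopenmp=libomp -fsanitize=address,fuzzer main.c
  warning: ignoring 'fuzzer' in '-fsanitize=address,fuzzer' option as it is
  not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored]

The host compilation keeps -fsanitize=address,fuzzer,fuzzer-no-link, while the
device cc1 receives only -fsanitize=address. When the unsupported sanitizer is
requested explicitly via -Xarch_device, the driver emits a hard error instead
of a warning.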
--- .../clang/Basic/DiagnosticDriverKinds.td | 10 +++ clang/include/clang/Options/Options.td | 31 +++++-- clang/lib/Driver/ToolChains/AMDGPU.cpp | 32 +++---- clang/lib/Driver/ToolChains/AMDGPU.h | 83 ++++++++++++++++--- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 20 ++--- clang/lib/Driver/ToolChains/HIPAMD.cpp | 10 +-- .../Driver/amdgpu-openmp-sanitize-options.c | 51 ++++++++++++ clang/test/Driver/hip-sanitize-options.hip | 73 ++++++++++++++++ 8 files changed, 254 insertions(+), 56 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 98e08c2faa59e..f262db55a0d92 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -126,6 +126,9 @@ def err_drv_bad_offload_arch_combo : Error< "invalid offload arch combinations: '%0' and '%1' (for a specific processor, " "a feature should either exist in all offload archs, or not exist in any " "offload archs)">; +def err_drv_unsupported_option_for_offload_arch_req_feature : Error< + "'%0' option for offload arch '%1' is not currently supported " + "there. Use it with an offload arch containing '%2' instead">; def warn_drv_unsupported_option_for_offload_arch_req_feature : Warning< "ignoring '%0' option for offload arch '%1' as it is not currently supported " "there. Use it with an offload arch containing '%2' instead">, @@ -133,6 +136,13 @@ def warn_drv_unsupported_option_for_offload_arch_req_feature : Warning< def warn_drv_unsupported_option_for_target : Warning< "ignoring '%0' option as it is not currently supported for target '%1'">, InGroup; +def err_drv_unsupported_option_for_target : Error< + "'%0' option is not currently supported for target '%1'">; +def warn_drv_unsupported_option_part_for_target : Warning< + "ignoring '%0' in '%1' option as it is not currently supported for target '%2'">, + InGroup; +def err_drv_unsupported_option_part_for_target : Error< + "'%0' in '%1' option is not currently supported for target '%2'">; def warn_drv_invalid_argument_for_flang : Warning< "'%0' is not valid for Fortran">, InGroup; diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 2f7434d8afe11..cda11fdc94230 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -201,6 +201,10 @@ def hlsl_Group : OptionGroup<"">, Group, DocName<"HLSL options">, Visibility<[ClangOption]>; +def fsan_cov_Group : OptionGroup<"<-fsanitize-coverage group>">, + Group, + DocName<"Sanitizer Coverage options">; + // Feature groups - these take command line options that correspond directly to // target specific features and can be translated directly from command line // options. 
@@ -2407,26 +2411,26 @@ def : Flag<["-"], "fno-sanitize-blacklist">, Group, Flags<[HelpHidden]>, Alias; def fsanitize_coverage : CommaJoined<["-"], "fsanitize-coverage=">, - Group, + Group, HelpText<"Specify the type of coverage instrumentation for Sanitizers">; def fno_sanitize_coverage : CommaJoined<["-"], "fno-sanitize-coverage=">, - Group, Visibility<[ClangOption, CLOption]>, + Group, Visibility<[ClangOption, CLOption]>, HelpText<"Disable features of coverage instrumentation for Sanitizers">, Values<"func,bb,edge,indirect-calls,trace-bb,trace-cmp,trace-div,trace-gep," "8bit-counters,trace-pc,trace-pc-guard,no-prune,inline-8bit-counters," "inline-bool-flag">; def fsanitize_coverage_allowlist : Joined<["-"], "fsanitize-coverage-allowlist=">, - Group, Visibility<[ClangOption, CLOption]>, + Group, Visibility<[ClangOption, CLOption]>, HelpText<"Restrict sanitizer coverage instrumentation exclusively to modules and functions that match the provided special case list, except the blocked ones">, MarshallingInfoStringVector>; def fsanitize_coverage_ignorelist : Joined<["-"], "fsanitize-coverage-ignorelist=">, - Group, Visibility<[ClangOption, CLOption]>, + Group, Visibility<[ClangOption, CLOption]>, HelpText<"Disable sanitizer coverage instrumentation for modules and functions " "that match the provided special case list, even the allowed ones">, MarshallingInfoStringVector>; def fsanitize_coverage_stack_depth_callback_min_EQ : Joined<["-"], "fsanitize-coverage-stack-depth-callback-min=">, - Group, + Group, MetaVarName<"">, HelpText<"Use callback for max stack depth tracing with minimum stack " "depth M">, @@ -7901,70 +7905,87 @@ def linker_option : Joined<["--"], "linker-option=">, HelpText<"Add linker option">, MarshallingInfoStringVector>; def fsanitize_coverage_type : Joined<["-"], "fsanitize-coverage-type=">, + Group, HelpText<"Sanitizer coverage type">, MarshallingInfoInt>; def fsanitize_coverage_indirect_calls : Flag<["-"], "fsanitize-coverage-indirect-calls">, + Group, HelpText<"Enable sanitizer coverage for indirect calls">, MarshallingInfoFlag>; def fsanitize_coverage_trace_bb : Flag<["-"], "fsanitize-coverage-trace-bb">, + Group, HelpText<"Enable basic block tracing in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_trace_cmp : Flag<["-"], "fsanitize-coverage-trace-cmp">, + Group, HelpText<"Enable cmp instruction tracing in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_trace_div : Flag<["-"], "fsanitize-coverage-trace-div">, + Group, HelpText<"Enable div instruction tracing in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_trace_gep : Flag<["-"], "fsanitize-coverage-trace-gep">, + Group, HelpText<"Enable gep instruction tracing in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_8bit_counters : Flag<["-"], "fsanitize-coverage-8bit-counters">, + Group, HelpText<"Enable frequency counters in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_inline_8bit_counters : Flag<["-"], "fsanitize-coverage-inline-8bit-counters">, + Group, HelpText<"Enable inline 8-bit counters in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_inline_bool_flag : Flag<["-"], "fsanitize-coverage-inline-bool-flag">, + Group, HelpText<"Enable inline bool flag in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_pc_table : Flag<["-"], "fsanitize-coverage-pc-table">, + Group, HelpText<"Create a table of coverage-instrumented PCs">, MarshallingInfoFlag>; def fsanitize_coverage_control_flow : 
Flag<["-"], "fsanitize-coverage-control-flow">, + Group, HelpText<"Collect control flow of function">, MarshallingInfoFlag>; def fsanitize_coverage_trace_pc : Flag<["-"], "fsanitize-coverage-trace-pc">, + Group, HelpText<"Enable PC tracing in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_trace_pc_guard : Flag<["-"], "fsanitize-coverage-trace-pc-guard">, + Group, HelpText<"Enable PC tracing with guard in sanitizer coverage">, MarshallingInfoFlag>; def fsanitize_coverage_no_prune : Flag<["-"], "fsanitize-coverage-no-prune">, + Group, HelpText<"Disable coverage pruning (i.e. instrument all blocks/edges)">, MarshallingInfoFlag>; def fsanitize_coverage_stack_depth : Flag<["-"], "fsanitize-coverage-stack-depth">, + Group, HelpText<"Enable max stack depth tracing">, MarshallingInfoFlag>; def fsanitize_coverage_trace_loads : Flag<["-"], "fsanitize-coverage-trace-loads">, + Group, HelpText<"Enable tracing of loads">, MarshallingInfoFlag>; def fsanitize_coverage_trace_stores : Flag<["-"], "fsanitize-coverage-trace-stores">, + Group, HelpText<"Enable tracing of stores">, MarshallingInfoFlag>; def fexperimental_sanitize_metadata_EQ_covered diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 9dc2c6ce39ae4..80e58d466b885 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -1074,24 +1074,9 @@ ROCMToolChain::getCommonDeviceLibNames( bool AMDGPUToolChain::shouldSkipSanitizeOption( const ToolChain &TC, const llvm::opt::ArgList &DriverArgs, StringRef TargetID, const llvm::opt::Arg *A) const { - // For actions without targetID, do nothing. - if (TargetID.empty()) - return false; - Option O = A->getOption(); - - if (!O.matches(options::OPT_fsanitize_EQ)) - return false; - - if (!DriverArgs.hasFlag(options::OPT_fgpu_sanitize, - options::OPT_fno_gpu_sanitize, true)) - return true; - auto &Diags = TC.getDriver().getDiags(); - - // For simplicity, we only allow -fsanitize=address - SanitizerMask K = parseSanitizerValue(A->getValue(), /*AllowGroups=*/false); - if (K != SanitizerKind::Address) - return true; + bool IsExplicitDevice = + A->getBaseArg().getOption().matches(options::OPT_Xarch_device); // Check 'xnack+' availability by default llvm::StringRef Processor = @@ -1112,10 +1097,17 @@ bool AMDGPUToolChain::shouldSkipSanitizeOption( (void)OptionalGpuArch; auto Loc = FeatureMap.find("xnack"); if (Loc == FeatureMap.end() || !Loc->second) { - Diags.Report( - clang::diag::warn_drv_unsupported_option_for_offload_arch_req_feature) - << A->getAsString(DriverArgs) << TargetID << "xnack+"; + if (IsExplicitDevice) { + Diags.Report( + clang::diag::err_drv_unsupported_option_for_offload_arch_req_feature) + << A->getAsString(DriverArgs) << TargetID << "xnack+"; + } else { + Diags.Report( + clang::diag::warn_drv_unsupported_option_for_offload_arch_req_feature) + << A->getAsString(DriverArgs) << TargetID << "xnack+"; + } return true; } + return false; } diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index 7b999c311154f..4dd8188842f83 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -101,7 +101,7 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUToolChain : public Generic_ELF { /// Needed for translating LTO options. const char *getDefaultLinker() const override { return "ld.lld"; } - /// Should skip sanitize options. + /// Should skip sanitize option. 
bool shouldSkipSanitizeOption(const ToolChain &TC, const llvm::opt::ArgList &DriverArgs, StringRef TargetID, @@ -155,18 +155,79 @@ class LLVM_LIBRARY_VISIBILITY ROCMToolChain : public AMDGPUToolChain { return SanitizerKind::Address; } - void diagnoseUnsupportedSanitizers(const llvm::opt::ArgList &Args) const { - if (!Args.hasFlag(options::OPT_fgpu_sanitize, options::OPT_fno_gpu_sanitize, - true)) - return; + bool diagnoseUnsupportedOption(const llvm::opt::Arg *A, + const llvm::opt::DerivedArgList &DAL, + const llvm::opt::ArgList &DriverArgs, + const char *Value = nullptr) const { auto &Diags = getDriver().getDiags(); - for (auto *A : Args.filtered(options::OPT_fsanitize_EQ)) { - SanitizerMask K = - parseSanitizerValue(A->getValue(), /*Allow Groups*/ false); - if (K != SanitizerKind::Address) - Diags.Report(clang::diag::warn_drv_unsupported_option_for_target) - << A->getAsString(Args) << getTriple().str(); + bool IsExplicitDevice = + A->getBaseArg().getOption().matches(options::OPT_Xarch_device); + + if (Value) { + unsigned DiagID = + IsExplicitDevice + ? clang::diag::err_drv_unsupported_option_part_for_target + : clang::diag::warn_drv_unsupported_option_part_for_target; + Diags.Report(DiagID) << Value << A->getAsString(DriverArgs) + << getTriple().str(); + } else { + unsigned DiagID = + IsExplicitDevice + ? clang::diag::err_drv_unsupported_option_for_target + : clang::diag::warn_drv_unsupported_option_for_target; + Diags.Report(DiagID) << A->getAsString(DAL) << getTriple().str(); } + return true; + } + + bool handleSanitizeOption(const ToolChain &TC, llvm::opt::DerivedArgList &DAL, + const llvm::opt::ArgList &DriverArgs, + StringRef TargetID, const llvm::opt::Arg *A) const { + if (TargetID.empty()) + return false; + // If we shouldn't do sanitizing, skip it. + if (!DriverArgs.hasFlag(options::OPT_fgpu_sanitize, + options::OPT_fno_gpu_sanitize, true)) + return true; + const llvm::opt::Option &Opt = A->getOption(); + // Sanitizer coverage is currently not supported for AMDGPU, so warn/error + // on every related option. + if (Opt.matches(options::OPT_fsan_cov_Group)) { + diagnoseUnsupportedOption(A, DAL, DriverArgs); + } + // If this isn't a sanitizer option, don't handle it. + if (!Opt.matches(options::OPT_fsanitize_EQ)) + return false; + + SmallVector SupportedSanitizers; + SmallVector UnSupportedSanitizers; + + for (const char *Value : A->getValues()) { + SanitizerMask K = parseSanitizerValue(Value, /*Allow Groups*/ false); + if (K & ROCMToolChain::getSupportedSanitizers()) + SupportedSanitizers.push_back(Value); + else + UnSupportedSanitizers.push_back(Value); + } + + // If there are no supported sanitizers, drop the whole argument. + if (SupportedSanitizers.empty()) { + diagnoseUnsupportedOption(A, DAL, DriverArgs); + return true; + } + // If only some sanitizers are unsupported, report each one individually. + if (!UnSupportedSanitizers.empty()) { + for (const char *Value : UnSupportedSanitizers) { + diagnoseUnsupportedOption(A, DAL, DriverArgs, Value); + } + } + // If we know the target arch, check if the sanitizer is supported for it. + if (shouldSkipSanitizeOption(TC, DriverArgs, TargetID, A)) + return true; + + // Add a new argument with only the supported sanitizers. 
+ DAL.AddJoinedArg(A, A->getOption(), llvm::join(SupportedSanitizers, ",")); + return true; } }; diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index e14bc574d139a..1a30875807d30 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -28,8 +28,6 @@ AMDGPUOpenMPToolChain::AMDGPUOpenMPToolChain(const Driver &D, // Lookup binaries into the driver directory, this is used to // discover the 'amdgpu-arch' executable. getProgramPaths().push_back(getDriver().Dir); - // Diagnose unsupported sanitizer options only once. - diagnoseUnsupportedSanitizers(Args); } void AMDGPUOpenMPToolChain::addClangTargetOptions( @@ -66,16 +64,11 @@ llvm::opt::DerivedArgList *AMDGPUOpenMPToolChain::TranslateArgs( const OptTable &Opts = getDriver().getOpts(); - // Skip sanitize options passed from the HostTC. Claim them early. - // The decision to sanitize device code is computed only by - // 'shouldSkipSanitizeOption'. - if (DAL->hasArg(options::OPT_fsanitize_EQ)) - DAL->claimAllArgs(options::OPT_fsanitize_EQ); - - for (Arg *A : Args) - if (!shouldSkipSanitizeOption(*this, Args, BoundArch, A) && - !llvm::is_contained(*DAL, A)) + for (Arg *A : Args) { + // Filter unsupported sanitizers passed from the HostTC. + if (!handleSanitizeOption(*this, *DAL, Args, BoundArch, A)) DAL->append(A); + } if (!BoundArch.empty()) { DAL->eraseArg(options::OPT_march_EQ); @@ -115,9 +108,8 @@ void AMDGPUOpenMPToolChain::AddIAMCUIncludeArgs(const ArgList &Args, SanitizerMask AMDGPUOpenMPToolChain::getSupportedSanitizers() const { // The AMDGPUOpenMPToolChain only supports sanitizers in the sense that it // allows sanitizer arguments on the command line if they are supported by the - // host toolchain. The AMDGPUOpenMPToolChain will actually ignore any command - // line arguments for any of these "supported" sanitizers. That means that no - // sanitization of device code is actually supported at this time. + // host toolchain. The AMDGPUOpenMPToolChain will later filter unsupported + // sanitizers from the command line arguments. // // This behavior is necessary because the host and device toolchains // invocations often share the command line, so the device toolchain must diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 0fbfa090ed9d3..231a38c2d3717 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -219,8 +219,6 @@ HIPAMDToolChain::HIPAMDToolChain(const Driver &D, const llvm::Triple &Triple, // Lookup binaries into the driver directory, this is used to // discover the clang-offload-bundler executable. getProgramPaths().push_back(getDriver().Dir); - // Diagnose unsupported sanitizer options only once. - diagnoseUnsupportedSanitizers(Args); } void HIPAMDToolChain::addClangTargetOptions( @@ -292,7 +290,8 @@ HIPAMDToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, const OptTable &Opts = getDriver().getOpts(); for (Arg *A : Args) { - if (!shouldSkipSanitizeOption(*this, Args, BoundArch, A)) + // Filter unsupported sanitizers passed from the HostTC. 
+ if (!handleSanitizeOption(*this, *DAL, Args, BoundArch, A)) DAL->append(A); } @@ -348,9 +347,8 @@ void HIPAMDToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs, SanitizerMask HIPAMDToolChain::getSupportedSanitizers() const { // The HIPAMDToolChain only supports sanitizers in the sense that it allows // sanitizer arguments on the command line if they are supported by the host - // toolchain. The HIPAMDToolChain will actually ignore any command line - // arguments for any of these "supported" sanitizers. That means that no - // sanitization of device code is actually supported at this time. + // toolchain. The HIPAMDToolChain will later filter unsupported sanitizers + // from the command line arguments. // // This behavior is necessary because the host and device toolchains // invocations often share the command line, so the device toolchain must diff --git a/clang/test/Driver/amdgpu-openmp-sanitize-options.c b/clang/test/Driver/amdgpu-openmp-sanitize-options.c index 10d64984918e6..fd7d11803249c 100644 --- a/clang/test/Driver/amdgpu-openmp-sanitize-options.c +++ b/clang/test/Driver/amdgpu-openmp-sanitize-options.c @@ -52,6 +52,48 @@ // RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ --offload-arch=gfx900:xnack+ -fsanitize=address -fno-gpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ // RUN: | FileCheck -check-prefixes=HOSTSAN,NOGPUSAN,SAN %s +// Catch invalid combination of sanitizers regardless of their order and ignore +// them selectively. +// (The address sanitizer enables the device sanitizer pipeline. The fuzzer +// implicitly turns on LLVMs SanitizerCoverage, which the driver then forwards +// to the device cc1. SanitizerCoverage is not supported on amdgcn.) + +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address,fuzzer --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSANCOMBINATION,INVALIDCOMBINATION1 %s +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=fuzzer,address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSANCOMBINATION,INVALIDCOMBINATION2 %s + +// Do the same for multiple -fsanitize arguments and multi-arch scenarios. 
+ +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ --offload-arch=gfx900:xnack- -fsanitize=address,fuzzer --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSANCOMBINATION,INVALIDCOMBINATION1 %s +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+,gfx900:xnack- -fsanitize=address,fuzzer --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSANCOMBINATION,INVALIDCOMBINATION1 %s +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+,gfx900:xnack- -fsanitize=fuzzer,address -fsanitize=leak --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSANCOMBINATION2,NOTSUPPORTED-DAG,INVALIDCOMBINATION2 %s + +// Check for -fsanitize-coverage options +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -fsanitize-coverage=inline-bool-flag --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=WARNSANCOV %s + +// Test -Xarch_device error scenario + +// RUN: not %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -Xarch_device -fsanitize=leak --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=UNSUPPORTEDERROR %s + +// RUN: not %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack- -Xarch_device -fsanitize=address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=XNACKERROR %s + +// RUN: not %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -Xarch_device -fsanitize=fuzzer,address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=INVALIDCOMBINATIONERROR %s + +// RUN: not %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -Xarch_device -fsanitize-coverage-stack-depth-callback-min=42 --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=ERRSANCOV %s + + +// INVALIDCOMBINATION1: warning: ignoring 'fuzzer' in '-fsanitize=address,fuzzer' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// INVALIDCOMBINATION2: warning: ignoring 'fuzzer' in '-fsanitize=fuzzer,address' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] + // FAIL-DAG: error: cannot find ROCm device library for ABI version 5; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library // NOTSUPPORTED-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' @@ -59,6 +101,8 @@ // XNACKNEG: warning: ignoring '-fsanitize=address' option for offload arch 'gfx908:xnack-' as it is not currently supported there. 
Use it with an offload arch containing 'xnack+' instead // HOSTSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} +// HOSTSANCOMBINATION: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address,fuzzer,fuzzer-no-link".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} +// HOSTSANCOMBINATION2: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address,fuzzer,fuzzer-no-link,leak".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} // GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-mlink-bitcode-file" "[^"]*asanrtl.bc".* "-mlink-bitcode-file" "[^"]*ockl.bc".* "-target-cpu" "(gfx908|gfx900|gfx1250|gfx1251)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}} // NOGPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-x" "c".*}} @@ -66,3 +110,10 @@ // SAN: {{"[^"]*llvm-offload-binary[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=(gfx908|gfx1250|gfx1251)(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}} // SAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "--offload-targets=amdgcn-amd-amdhsa".* "-x" "ir".*}} // SAN: {{"[^"]*clang-linker-wrapper[^"]*".* "--host-triple=x86_64-unknown-linux-gnu".* "--linker-path=[^"]*".* "--whole-archive" "[^"]*(libclang_rt.asan_static.a|libclang_rt.asan_static-x86_64.a)".* "--whole-archive" "[^"]*(libclang_rt.asan.a|libclang_rt.asan-x86_64.a)".*}} + +// UNSUPPORTEDERROR: error: '-fsanitize=leak' option is not currently supported for target 'amdgcn-amd-amdhsa' +// XNACKERROR: error: '-fsanitize=address' option for offload arch 'gfx908:xnack-' is not currently supported there. Use it with an offload arch containing 'xnack+' instead +// INVALIDCOMBINATIONERROR: error: 'fuzzer' in '-fsanitize=fuzzer,address' option is not currently supported for target 'amdgcn-amd-amdhsa' + +// WARNSANCOV: warning: ignoring '-fsanitize-coverage=inline-bool-flag' option as it is not currently supported for target 'amdgcn-amd-amdhsa' +// ERRSANCOV: error: '-fsanitize-coverage-stack-depth-callback-min=42' option is not currently supported for target 'amdgcn-amd-amdhsa' diff --git a/clang/test/Driver/hip-sanitize-options.hip b/clang/test/Driver/hip-sanitize-options.hip index 490385173a4cb..d436756ee046b 100644 --- a/clang/test/Driver/hip-sanitize-options.hip +++ b/clang/test/Driver/hip-sanitize-options.hip @@ -52,6 +52,51 @@ // RUN: -fsanitize=leak -nogpuinc --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefixes=NOGPUNEG %s +// Catch invalid combination of sanitizers regardless of their order and ignore +// them selectively. +// (The address sanitizer enables the device sanitizer pipeline. The fuzzer +// implicitly turns on LLVMs SanitizerCoverage, which the driver then forwards +// to the device cc1. SanitizerCoverage is not supported on amdgcn.) 
+ +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -fsanitize=address,fuzzer --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=INVALIDCOMBINATION,INVALIDCOMBINATION1 %s +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -fsanitize=fuzzer,address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=INVALIDCOMBINATION,INVALIDCOMBINATION2 %s + +// Do the same for multiple -fsanitize arguments and multi-arch scenarios. + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ --offload-arch=gfx908:xnack- \ +// RUN: -fsanitize=address,fuzzer -fsanitize=leak --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=MULT1,XNACK2 %s +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+,gfx908:xnack- \ +// RUN: -fsanitize=fuzzer,address -fsanitize=leak --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=MULT2,XNACK2 %s + +// Check for -fsanitize-coverage options +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -fsanitize=address -fsanitize-coverage=inline-bool-flag --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=WARNSANCOV %s + +// Test -Xarch_device error scenario + +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -Xarch_device -fsanitize=leak --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=UNSUPPORTEDERROR %s + +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack- \ +// RUN: -Xarch_device -fsanitize=address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=XNACKERROR %s + +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -Xarch_device -fsanitize=fuzzer,address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=INVALIDCOMBINATIONERROR %s + +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ +// RUN: -fsanitize=address -Xarch_device -fsanitize-coverage-stack-depth-callback-min=42 --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=ERRSANCOV %s + // CHECK-NOT: {{"[^"]*clang[^"]*".* "-fcuda-is-device".* "-fsanitize=address"}} // CHECK-NOT: {{"[^"]*clang[^"]*".* "-fcuda-is-device".* "-mlink-bitcode-file" ".*asanrtl.bc"}} // CHECK-NOT: {{"[^"]*lld(\.exe){0,1}".* ".*hip.bc"}} @@ -101,3 +146,31 @@ // NOGPUNEG-NOT: {{"[^"]*clang[^"]*".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-target-cpu" "gfx900".* "-target-feature" "-xnack"}} // NOGPUNEG-NOT: {{"[^"]*clang[^"]*".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-target-cpu" "gfx906"}} // NOGPUNEG-NOT: {{"[^"]*lld(\.exe){0,1}".* ".*hip.bc"}} + +// INVALIDCOMBINATION1-DAG: warning: ignoring 'fuzzer' in '-fsanitize=address,fuzzer' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// INVALIDCOMBINATION2-DAG: warning: ignoring 'fuzzer' in '-fsanitize=fuzzer,address' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// INVALIDCOMBINATION-DAG: {{"[^"]*clang[^"]*".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-target-cpu" "gfx900".* "-target-feature" "\+xnack".* "-fsanitize=address"}} +// INVALIDCOMBINATION-DAG: {{"[^"]*clang[^"]*".* "-triple" "x86_64-unknown-linux-gnu".* "-fsanitize=address,fuzzer,fuzzer-no-link"}} + +// MULT1-DAG: 
warning: ignoring 'fuzzer' in '-fsanitize=address,fuzzer' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT1-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT1-DAG: warning: ignoring 'fuzzer' in '-fsanitize=address,fuzzer' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT1-DAG: warning: ignoring '-fsanitize=address,fuzzer' option for offload arch 'gfx908:xnack-' as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead [-Woption-ignored] +// MULT1-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] + +// MULT2-DAG: warning: ignoring 'fuzzer' in '-fsanitize=fuzzer,address' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT2-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT2-DAG: warning: ignoring 'fuzzer' in '-fsanitize=fuzzer,address' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] +// MULT2-DAG: warning: ignoring '-fsanitize=fuzzer,address' option for offload arch 'gfx908:xnack-' as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead [-Woption-ignored] +// MULT2-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' [-Woption-ignored] + +// XNACK2-DAG: {{"[^"]*clang[^"]*".* "-mlink-bitcode-file" ".*asanrtl.bc".* "-target-cpu" "gfx900".* "-target-feature" "\+xnack".* "-fsanitize=address"}} +// XNACK2-DAG: {{"[^"]*clang[^"]*".* "-target-cpu" "gfx908"}} +// XNACK2-DAG: {{"[^"]*clang[^"]*".* "-triple" "x86_64-unknown-linux-gnu".* "-fsanitize=address,fuzzer,fuzzer-no-link,leak"}} + +// UNSUPPORTEDERROR: error: '-fsanitize=leak' option is not currently supported for target 'amdgcn-amd-amdhsa' +// XNACKERROR: error: '-fsanitize=address' option for offload arch 'gfx900:xnack-' is not currently supported there. Use it with an offload arch containing 'xnack+' instead +// INVALIDCOMBINATIONERROR: error: 'fuzzer' in '-fsanitize=fuzzer,address' option is not currently supported for target 'amdgcn-amd-amdhsa' + +// WARNSANCOV: warning: ignoring '-fsanitize-coverage=inline-bool-flag' option as it is not currently supported for target 'amdgcn-amd-amdhsa' +// ERRSANCOV: error: '-fsanitize-coverage-stack-depth-callback-min=42' option is not currently supported for target 'amdgcn-amd-amdhsa' From 20795e06ed8b91109d5237ffb02f37245d1f781d Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Tue, 18 Nov 2025 10:12:56 +0100 Subject: [PATCH 12/35] [AMDGPU][SIMemoryLegalizer] Combine GFX10-11 CacheControl Classes (#168058) Also breaks the long inheritance chains by making both `SIGfx10CacheControl` and `SIGfx12CacheControl` inherit from `SICacheControl` directly. With this patch, we now just have 3 `SICacheControl` implementations that each do their own thing, and there is no more code hidden 3 superclasses above (which made this code harder to read and maintain than it needed to be). 
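In sketch form, the class layout changes as follows (members elided here; the
GFX10/GFX11 differences are handled inside SIGfx10CacheControl via
AMDGPU::isGFX10/isGFX11 checks, as the diff below shows):

  // Before: one chain, each class refining the previous generation.
  //   SICacheControl <- SIGfx6 <- SIGfx10 <- SIGfx11 <- SIGfx12
  // After: three independent implementations.
  class SIGfx6CacheControl final : public SICacheControl {};  // pre-GFX10
  class SIGfx10CacheControl final : public SICacheControl {}; // GFX10 and GFX11
  class SIGfx12CacheControl final : public SICacheControl {}; // GFX12+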
--- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 158 +++++-------------- 1 file changed, 38 insertions(+), 120 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 49aba39872138..bf04c7fa132c0 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -404,7 +404,7 @@ class SICacheControl { /// Generates code sequences for the memory model of all GFX targets below /// GFX10. -class SIGfx6CacheControl : public SICacheControl { +class SIGfx6CacheControl final : public SICacheControl { public: SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} @@ -443,14 +443,27 @@ class SIGfx6CacheControl : public SICacheControl { Position Pos) const override; }; -class SIGfx10CacheControl : public SIGfx6CacheControl { +/// Generates code sequences for the memory model of GFX10/11. +class SIGfx10CacheControl final : public SICacheControl { public: - SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} + SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override; + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return false; + } + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return false; + } + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal, @@ -463,23 +476,17 @@ class SIGfx10CacheControl : public SIGfx6CacheControl { bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; -}; - -class SIGfx11CacheControl : public SIGfx10CacheControl { -public: - SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; + bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, + Position Pos) const override { + return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); + } }; -class SIGfx12CacheControl : public SIGfx11CacheControl { +class SIGfx12CacheControl final : public SICacheControl { protected: // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. // \returns Returns true if \p MI is modified, false otherwise. @@ -504,7 +511,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; public: - SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { + SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) { // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases // the behavior is the same if assuming GFX12.0 in CU mode. 
    assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
@@ -915,10 +922,8 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
   GCNSubtarget::Generation Generation = ST.getGeneration();
   if (Generation < AMDGPUSubtarget::GFX10)
     return std::make_unique<SIGfx6CacheControl>(ST);
-  if (Generation < AMDGPUSubtarget::GFX11)
-    return std::make_unique<SIGfx10CacheControl>(ST);
   if (Generation < AMDGPUSubtarget::GFX12)
-    return std::make_unique<SIGfx11CacheControl>(ST);
+    return std::make_unique<SIGfx10CacheControl>(ST);
   return std::make_unique<SIGfx12CacheControl>(ST);
 }

@@ -1438,8 +1443,7 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
 }

 bool SIGfx10CacheControl::enableLoadCacheBypass(
-    const MachineBasicBlock::iterator &MI,
-    SIAtomicScope Scope,
+    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
     SIAtomicAddrSpace AddrSpace) const {
   assert(MI->mayLoad() && !MI->mayStore());
   bool Changed = false;
@@ -1450,7 +1454,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
     case SIAtomicScope::AGENT:
       // Set the L0 and L1 cache policies to MISS_EVICT.
       // Note: there is no L2 cache coherent bypass control at the ISA level.
-      Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
+      // For GFX10, set GLC+DLC, for GFX11, only set GLC.
+      Changed |=
+          enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
       break;
     case SIAtomicScope::WORKGROUP:
       // In WGP mode the waves of a work-group can be executing on either CU of
@@ -1504,6 +1510,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
       Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
     }

+    // GFX11: Set MALL NOALLOC for both load and store instructions.
+    if (AMDGPU::isGFX11(ST))
+      Changed |= enableCPolBits(MI, CPol::DLC);
+
     // Ensure operation has completed at system scope to cause all volatile
     // operations to be visible outside the program in a global order. Do not
     // request cross address space as only the global address space can be
@@ -1524,6 +1534,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
       Changed |= enableCPolBits(MI, CPol::GLC);
     Changed |= enableCPolBits(MI, CPol::SLC);

+    // GFX11: Set MALL NOALLOC for both load and store instructions.
+    if (AMDGPU::isGFX11(ST))
+      Changed |= enableCPolBits(MI, CPol::DLC);
+
     return Changed;
   }

@@ -1722,102 +1736,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   return Changed;
 }

-bool SIGfx11CacheControl::enableLoadCacheBypass(
-    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
-    SIAtomicAddrSpace AddrSpace) const {
-  assert(MI->mayLoad() && !MI->mayStore());
-  bool Changed = false;
-
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-    case SIAtomicScope::AGENT:
-      // Set the L0 and L1 cache policies to MISS_EVICT.
-      // Note: there is no L2 cache coherent bypass control at the ISA level.
-      Changed |= enableCPolBits(MI, CPol::GLC);
-      break;
-    case SIAtomicScope::WORKGROUP:
-      // In WGP mode the waves of a work-group can be executing on either CU of
-      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
-      // CU mode all waves of a work-group are on the same CU, and so the L0
-      // does not need to be bypassed.
-      if (!ST.isCuModeEnabled())
-        Changed |= enableCPolBits(MI, CPol::GLC);
-      break;
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // No cache to bypass.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
-    }
-  }
-
-  /// The scratch address space does not need the global memory caches
-  /// to be bypassed as all memory operations by the same thread are
-  /// sequentially consistent, and no other thread can access scratch
-  /// memory.
-
-  /// Other address spaces do not have a cache.
-
-  return Changed;
-}
-
-bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
-    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
-
-  // Only handle load and store, not atomic read-modify-write insructions. The
-  // latter use glc to indicate if the atomic returns a result and so must not
-  // be used for cache control.
-  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
-  // Only update load and store, not LLVM IR atomic read-modify-write
-  // instructions. The latter are always marked as volatile so cannot sensibly
-  // handle it as do not want to pessimize all atomics. Also they do not support
-  // the nontemporal attribute.
-  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
-  bool Changed = false;
-
-  if (IsVolatile) {
-    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
-    // and MISS_LRU for store instructions.
-    // Note: there is no L2 cache coherent bypass control at the ISA level.
-    if (Op == SIMemOp::LOAD)
-      Changed |= enableCPolBits(MI, CPol::GLC);
-
-    // Set MALL NOALLOC for load and store instructions.
-    Changed |= enableCPolBits(MI, CPol::DLC);
-
-    // Ensure operation has completed at system scope to cause all volatile
-    // operations to be visible outside the program in a global order. Do not
-    // request cross address space as only the global address space can be
-    // observable outside the program, so no need to cause a waitcnt for LDS
-    // address space operations.
-    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
-                          Position::AFTER, AtomicOrdering::Unordered,
-                          /*AtomicsOnly=*/false);
-    return Changed;
-  }
-
-  if (IsNonTemporal) {
-    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
-    // and L2 cache policy to STREAM.
-    // For stores setting both GLC and SLC configures L0 and L1 cache policy
-    // to MISS_EVICT and the L2 cache policy to STREAM.
-    if (Op == SIMemOp::STORE)
-      Changed |= enableCPolBits(MI, CPol::GLC);
-    Changed |= enableCPolBits(MI, CPol::SLC);
-
-    // Set MALL NOALLOC for load and store instructions.
-    Changed |= enableCPolBits(MI, CPol::DLC);
-    return Changed;
-  }
-
-  return Changed;
-}
-
 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
                                 AMDGPU::CPol::CPol Value) const {
   MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);

From beb06eb884be10449b515f215c859fd432fb2635 Mon Sep 17 00:00:00 2001
From: Katya Romanova <56653669+romanova-ekaterina@users.noreply.github.com>
Date: Tue, 18 Nov 2025 01:24:38 -0800
Subject: [PATCH 13/35] Fixed 2 tests that failed on macOS (#168482)

1. Fixed 2 DTLTO cache tests that failed on macOS because the input to
   the grep command differs from what it is on Windows.
2.
Removed unneeded comments from dtlto-cache.ll --- cross-project-tests/dtlto/dtlto-cache.test | 8 ++++---- cross-project-tests/dtlto/dtlto-thinlto-cache.test | 4 ++-- llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll | 7 ------- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/cross-project-tests/dtlto/dtlto-cache.test b/cross-project-tests/dtlto/dtlto-cache.test index b98d4dbb433bb..5dd67a50ab2c3 100644 --- a/cross-project-tests/dtlto/dtlto-cache.test +++ b/cross-project-tests/dtlto/dtlto-cache.test @@ -17,7 +17,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # Check that there are two backend compilation jobs occurred. -RUN: grep -wo args populate1.*.dist-file.json | wc -l | grep -qx 3 +RUN: grep -wo args populate1.*.dist-file.json | wc -l | grep -qx "\s*3" RUN: ls cache.dir/llvmcache.timestamp RUN: ls cache.dir | count 3 @@ -32,7 +32,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # Check that there are no backend compilation jobs occurred. -RUN: grep -wo args populate2.*.dist-file.json | wc -l | grep -qx 1 +RUN: grep -wo args populate2.*.dist-file.json | wc -l | grep -qx "\s*1" RUN: ls cache.dir | count 3 RUN: %clang -O0 --target=x86_64-linux-gnu -flto=thin -c foo.c -o foo.O0.o @@ -52,7 +52,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # Check that there are two new backend compilation jobs occurred. -RUN: grep -wo args populate3.*.dist-file.json | wc -l | grep -qx 3 +RUN: grep -wo args populate3.*.dist-file.json | wc -l | grep -qx "\s*3" RUN: ls cache.dir | count 5 RUN: %clang -O2 --target=x86_64-linux-gnu -flto=thin -c main-partial.c @@ -69,7 +69,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # Check that there is one new backend compilation jobs occurred. -RUN: grep -wo args main-partial.*.dist-file.json | wc -l | grep -qx 2 +RUN: grep -wo args main-partial.*.dist-file.json | wc -l | grep -qx "\s*2" RUN: ls cache.dir | count 6 #--- foo.c diff --git a/cross-project-tests/dtlto/dtlto-thinlto-cache.test b/cross-project-tests/dtlto/dtlto-thinlto-cache.test index c177112e2dbbd..9b0ca228480d1 100644 --- a/cross-project-tests/dtlto/dtlto-thinlto-cache.test +++ b/cross-project-tests/dtlto/dtlto-thinlto-cache.test @@ -29,7 +29,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # Check that there are two backend compilation jobs occurred. -RUN: grep -wo args populate1.*.dist-file.json | wc -l | grep -qx 3 +RUN: grep -wo args populate1.*.dist-file.json | wc -l | grep -qx "\s*3" RUN: ls cache.dir | count 5 # Clean up cache directory. @@ -45,7 +45,7 @@ RUN: -Wl,--thinlto-cache-dir=cache.dir \ RUN: -Wl,--save-temps # Check that there are two backend compilation jobs occurred. -RUN: grep -wo args populate2.*.dist-file.json | wc -l | grep -qx 3 +RUN: grep -wo args populate2.*.dist-file.json | wc -l | grep -qx "\s*3" RUN: ls cache.dir/llvmcache.timestamp RUN: ls cache.dir | count 3 diff --git a/llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll b/llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll index df98c5e90b1ae..129093452101d 100644 --- a/llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll +++ b/llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll @@ -43,18 +43,11 @@ THINLTO-DAG: {{^}}t.o.2{{$}} RUN: %{command} -; Check that the expected output files have been created. 
RUN: ls | count 3 -; Check that two native object files has been created RUN: ls | FileCheck %s --check-prefix=THINLTO -; Check that DTLTO cache directory has been created RUN: ls cache-dir/* | count 2 -; Check that 2 cache entries are created RUN: ls cache-dir/llvmcache-* | count 2 - - - ;--- t1.ll target triple = "x86_64-unknown-linux-gnu" From 603ac57ef9768dc557d223f61c5e3c5cb5e50a12 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 18 Nov 2025 09:26:21 +0000 Subject: [PATCH 14/35] [AArch64][SME] Add support for zeroing ZT0 to CommitZASavePseudo (#166360) This will be used to support ZT0 in the MachineSMEABIPass. --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 13 ++-- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 3 +- llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 1 + .../CodeGen/AArch64/expand-sme-pseudos.mir | 69 ++++++++++++++++++- 4 files changed, 79 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index f63981b87c1c1..34d74d04c4419 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1063,6 +1063,7 @@ AArch64ExpandPseudo::expandCommitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); + [[maybe_unused]] auto *RI = MBB.getParent()->getSubtarget().getRegisterInfo(); // Compare TPIDR2_EL0 against 0. Commit ZA if TPIDR2_EL0 is non-zero. MachineInstrBuilder Branch = @@ -1073,21 +1074,25 @@ AArch64ExpandPseudo::expandCommitZASave(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::BL)); // Copy operands (mainly the regmask) from the pseudo. - for (unsigned I = 2; I < MI.getNumOperands(); ++I) + for (unsigned I = 3; I < MI.getNumOperands(); ++I) MIB.add(MI.getOperand(I)); // Clear TPIDR2_EL0. 
BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::MSR)) .addImm(AArch64SysReg::TPIDR2_EL0) .addReg(AArch64::XZR); bool ZeroZA = MI.getOperand(1).getImm() != 0; + bool ZeroZT0 = MI.getOperand(2).getImm() != 0; if (ZeroZA) { - [[maybe_unused]] auto *TRI = - MBB.getParent()->getSubtarget().getRegisterInfo(); - assert(MI.definesRegister(AArch64::ZAB0, TRI) && "should define ZA!"); + assert(MI.definesRegister(AArch64::ZAB0, RI) && "should define ZA!"); BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::ZERO_M)) .addImm(ZERO_ALL_ZA_MASK) .addDef(AArch64::ZAB0, RegState::ImplicitDefine); } + if (ZeroZT0) { + assert(MI.definesRegister(AArch64::ZT0, RI) && "should define ZT0!"); + BuildMI(CondBB, CondBB.back(), DL, TII->get(AArch64::ZERO_T)) + .addDef(AArch64::ZT0); + } MI.eraseFromParent(); return &EndBB; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 5bb70ee11b06d..737169253ddb3 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -108,7 +108,8 @@ def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>; def CommitZASavePseudo : Pseudo<(outs), - (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>, + (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i1imm:$zero_zt0, + i64imm:$commit_routine, variable_ops), []>, Sched<[]>; def AArch64_inout_za_use diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index 24d30c731b945..2afbec92392f0 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -842,6 +842,7 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo)) .addReg(TPIDR2EL0) .addImm(ZeroZA ? 1 : 0) + .addImm(/*ZeroZT0=*/false) .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE)) .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); if (ZeroZA) diff --git a/llvm/test/CodeGen/AArch64/expand-sme-pseudos.mir b/llvm/test/CodeGen/AArch64/expand-sme-pseudos.mir index 6ca9b9b6cb200..9b745d56c4b7f 100644 --- a/llvm/test/CodeGen/AArch64/expand-sme-pseudos.mir +++ b/llvm/test/CodeGen/AArch64/expand-sme-pseudos.mir @@ -62,7 +62,7 @@ body: | ; CHECK-NEXT: RET undef $lr $x8 = MRS 56965, implicit-def $nzcv - CommitZASavePseudo $x8, 0, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0 + CommitZASavePseudo $x8, 0, 0, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0 RET_ReallyLR @@ -94,7 +94,72 @@ body: | ; CHECK-NEXT: RET undef $lr $x8 = MRS 56965, implicit-def $nzcv - CommitZASavePseudo $x8, 1, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $zab0 + CommitZASavePseudo $x8, 1, 0, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $zab0 + + RET_ReallyLR + +... 
+---
+# X8 = TPIDR2_EL0
+name: commit_za_save_zero_zt0
+alignment: 4
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: commit_za_save_zero_zt0
+    ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: $x8 = MRS 56965, implicit-def $nzcv
+    ; CHECK-NEXT: CBNZX $x8, %bb.1
+    ; CHECK-NEXT: B %bb.2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: .1:
+    ; CHECK-NEXT: successors: %bb.2(0x80000000)
+    ; CHECK-NEXT: liveins: $x8
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: BL &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $lr, implicit $sp, implicit-def $zt0
+    ; CHECK-NEXT: MSR 56965, $xzr
+    ; CHECK-NEXT: $zt0 = ZERO_T
+    ; CHECK-NEXT: B %bb.2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: .2:
+    ; CHECK-NEXT: RET undef $lr
+    $x8 = MRS 56965, implicit-def $nzcv
+
+    CommitZASavePseudo $x8, 0, 1, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $zt0
+
+    RET_ReallyLR
+
+...
+---
+# X8 = TPIDR2_EL0
+name: commit_za_save_zero_everything
+alignment: 4
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: commit_za_save_zero_everything
+    ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: $x8 = MRS 56965, implicit-def $nzcv
+    ; CHECK-NEXT: CBNZX $x8, %bb.1
+    ; CHECK-NEXT: B %bb.2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: .1:
+    ; CHECK-NEXT: successors: %bb.2(0x80000000)
+    ; CHECK-NEXT: liveins: $x8
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: BL &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $zab0, implicit-def $zt0
+    ; CHECK-NEXT: MSR 56965, $xzr
+    ; CHECK-NEXT: ZERO_M 255, implicit-def $zab0
+    ; CHECK-NEXT: $zt0 = ZERO_T
+    ; CHECK-NEXT: B %bb.2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: .2:
+    ; CHECK-NEXT: RET undef $lr
+    $x8 = MRS 56965, implicit-def $nzcv
+
+    CommitZASavePseudo $x8, 1, 1, &__arm_tpidr2_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x0, implicit-def $zab0, implicit-def $zt0

   RET_ReallyLR

From 7c34848ae1405127a66abf10a963d8c7748f2e51 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Tue, 18 Nov 2025 09:35:48 +0000
Subject: [PATCH 15/35] [VPlan] Hoist loads with invariant addresses using
 noalias metadata. (#166247)

This patch implements a transform to hoist single-scalar replicated
loads with invariant addresses out of the vector loop to the preheader
when scoped noalias metadata proves they cannot alias with any stores
in the loop.

This enables hoisting of loads we can prove do not alias any stores in
the loop due to memory runtime checks added during vectorization.
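As a minimal hand-written sketch of the IR pattern this targets (the
function, values, and metadata IDs below are hypothetical, not taken
from the patch's tests): the load's !alias.scope is fully covered by
the store's !noalias list, so scoped-noalias reasoning shows the store
cannot clobber the loaded location and the load may sit in the
preheader:

```
; Hypothetical example: %invariant_ptr is loop-invariant and the store's
; !noalias list (!3) contains the load's scope (!1), so the load is
; eligible for hoisting to the vector preheader.
define void @sketch(ptr %invariant_ptr, ptr %dst, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %v = load i32, ptr %invariant_ptr, align 4, !alias.scope !3
  %gep = getelementptr inbounds i32, ptr %dst, i64 %iv
  store i32 %v, ptr %gep, align 4, !alias.scope !4, !noalias !3
  %iv.next = add nuw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %n
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

!0 = distinct !{!0}     ; alias domain
!1 = distinct !{!1, !0} ; scope of the invariant load
!2 = distinct !{!2, !0} ; scope of the store
!3 = !{!1}
!4 = !{!2}
```

In VPlan terms such a load is a single-scalar replicate recipe;
hoistInvariantLoads moves it into the vector preheader once every
store's noalias metadata rules out aliasing.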
PR: https://github.com/llvm/llvm-project/pull/166247 --- llvm/include/llvm/Analysis/ScopedNoAliasAA.h | 8 +- llvm/lib/Analysis/ScopedNoAliasAA.cpp | 4 +- llvm/lib/Transforms/Vectorize/VPlan.h | 8 ++ .../Transforms/Vectorize/VPlanTransforms.cpp | 54 ++++++++++ .../Transforms/Vectorize/VPlanTransforms.h | 5 + llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 18 ++++ llvm/lib/Transforms/Vectorize/VPlanUtils.h | 6 ++ .../AArch64/conditional-branches-cost.ll | 12 +-- .../LoopVectorize/AArch64/store-costs-sve.ll | 16 +-- .../vf-will-not-generate-any-vector-insts.ll | 6 +- .../LoopVectorize/X86/cost-model.ll | 100 ++++++++++-------- .../LoopVectorize/X86/uniform_mem_op.ll | 2 +- ...nd-sink-mem-ops-with-invariant-pointers.ll | 6 +- .../interleaved-accesses-metadata.ll | 6 +- .../multiple-strides-vectorization.ll | 16 +-- .../pointer-select-runtime-checks.ll | 6 +- llvm/test/Transforms/LoopVectorize/pr50686.ll | 12 +-- .../LoopVectorize/reduction-align.ll | 6 +- .../LoopVectorize/runtime-checks-hoist.ll | 10 +- .../LoopVectorize/single-scalar-cast-minbw.ll | 6 +- ...ive-path-inner-loop-with-runtime-checks.ll | 2 +- .../AArch64/hoist-load-from-vector-loop.ll | 66 ++++++++++-- 22 files changed, 265 insertions(+), 110 deletions(-) diff --git a/llvm/include/llvm/Analysis/ScopedNoAliasAA.h b/llvm/include/llvm/Analysis/ScopedNoAliasAA.h index 942cc6f2a4b2b..dbe1afa50ee3a 100644 --- a/llvm/include/llvm/Analysis/ScopedNoAliasAA.h +++ b/llvm/include/llvm/Analysis/ScopedNoAliasAA.h @@ -46,12 +46,12 @@ class ScopedNoAliasAAResult : public AAResultBase { LLVM_ABI ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2, AAQueryInfo &AAQI); - LLVM_ABI void + LLVM_ABI static void collectScopedDomains(const MDNode *NoAlias, - SmallPtrSetImpl &Domains) const; + SmallPtrSetImpl &Domains); -private: - bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias) const; + LLVM_ABI static bool mayAliasInScopes(const MDNode *Scopes, + const MDNode *NoAlias); }; /// Analysis pass providing a never-invalidated alias analysis result. diff --git a/llvm/lib/Analysis/ScopedNoAliasAA.cpp b/llvm/lib/Analysis/ScopedNoAliasAA.cpp index 4d6c0cc71f898..d24ad0255256c 100644 --- a/llvm/lib/Analysis/ScopedNoAliasAA.cpp +++ b/llvm/lib/Analysis/ScopedNoAliasAA.cpp @@ -116,7 +116,7 @@ static void collectMDInDomain(const MDNode *List, const MDNode *Domain, /// Collect the set of scoped domains relevant to the noalias scopes. void ScopedNoAliasAAResult::collectScopedDomains( - const MDNode *NoAlias, SmallPtrSetImpl &Domains) const { + const MDNode *NoAlias, SmallPtrSetImpl &Domains) { if (!NoAlias) return; assert(Domains.empty() && "Domains should be empty"); @@ -127,7 +127,7 @@ void ScopedNoAliasAAResult::collectScopedDomains( } bool ScopedNoAliasAAResult::mayAliasInScopes(const MDNode *Scopes, - const MDNode *NoAlias) const { + const MDNode *NoAlias) { if (!Scopes || !NoAlias) return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c81834e401726..a88ddf217da9b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -32,6 +32,7 @@ #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/FMF.h" @@ -981,6 +982,13 @@ class VPIRMetadata { /// Intersect this VPIRMetada object with \p MD, keeping only metadata /// nodes that are common to both. 
void intersect(const VPIRMetadata &MD); + + /// Get metadata of kind \p Kind. Returns nullptr if not found. + MDNode *getMetadata(unsigned Kind) const { + auto It = + find_if(Metadata, [Kind](const auto &P) { return P.first == Kind; }); + return It != Metadata.end() ? It->second : nullptr; + } }; /// This is a concrete Recipe that models a single VPlan-level instruction. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 89118b49bed44..26563242de283 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -24,15 +24,20 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ScalarEvolutionPatternMatch.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/Casting.h" #include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" @@ -2401,6 +2406,7 @@ void VPlanTransforms::optimize(VPlan &Plan) { runPass(removeDeadRecipes, Plan); runPass(createAndOptimizeReplicateRegions, Plan); + runPass(hoistInvariantLoads, Plan); runPass(mergeBlocksIntoPredecessors, Plan); runPass(licm, Plan); } @@ -3914,6 +3920,54 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { } } +void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) { + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + + // Collect candidate loads with invariant addresses and noalias scopes + // metadata and memory-writing recipes with noalias metadata. + SmallVector> CandidateLoads; + SmallVector Stores; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(LoopRegion->getEntry()))) { + for (VPRecipeBase &R : *VPBB) { + // Only handle single-scalar replicated loads with invariant addresses. + if (auto *RepR = dyn_cast(&R)) { + if (RepR->isPredicated() || !RepR->isSingleScalar() || + RepR->getOpcode() != Instruction::Load) + continue; + + VPValue *Addr = RepR->getOperand(0); + if (Addr->isDefinedOutsideLoopRegions()) { + MemoryLocation Loc = *vputils::getMemoryLocation(*RepR); + if (!Loc.AATags.Scope) + continue; + CandidateLoads.push_back({RepR, Loc}); + } + } + if (R.mayWriteToMemory()) { + auto Loc = vputils::getMemoryLocation(R); + if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias) + return; + Stores.push_back(*Loc); + } + } + } + + VPBasicBlock *Preheader = Plan.getVectorPreheader(); + for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) { + // Hoist the load to the preheader if it doesn't alias with any stores + // according to the noalias metadata. 
Other loads should have been hoisted + // by other passes + const AAMDNodes &LoadAA = LoadLoc.AATags; + if (all_of(Stores, [&](const MemoryLocation &StoreLoc) { + return !ScopedNoAliasAAResult::mayAliasInScopes( + LoadAA.Scope, StoreLoc.AATags.NoAlias); + })) { + LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi()); + } + } +} + void VPlanTransforms::materializeConstantVectorTripCount( VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index a44a4f69c917b..708ea4185e1cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -309,6 +309,11 @@ struct VPlanTransforms { /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors. static void materializeBroadcasts(VPlan &Plan); + /// Hoist single-scalar loads with invariant addresses out of the vector loop + /// to the preheader, if they are proven not to alias with any stores in the + /// plan using noalias metadata. + static void hoistInvariantLoads(VPlan &Plan); + // Materialize vector trip counts for constants early if it can simply be // computed as (Original TC / VF * UF) * VF * UF. static void diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 3bc2dfd623777..2536d61392ed1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -11,6 +11,7 @@ #include "VPlanDominatorTree.h" #include "VPlanPatternMatch.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" using namespace llvm; @@ -393,3 +394,20 @@ bool VPBlockUtils::isLatch(const VPBlockBase *VPB, return VPB->getNumSuccessors() == 2 && VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT); } + +std::optional +vputils::getMemoryLocation(const VPRecipeBase &R) { + return TypeSwitch>(&R) + .Case( + [](auto *S) { + MemoryLocation Loc; + // Populate noalias metadata from VPIRMetadata. + if (MDNode *NoAliasMD = S->getMetadata(LLVMContext::MD_noalias)) + Loc.AATags.NoAlias = NoAliasMD; + if (MDNode *AliasScopeMD = + S->getMetadata(LLVMContext::MD_alias_scope)) + Loc.AATags.Scope = AliasScopeMD; + return Loc; + }) + .Default([](auto *) { return std::nullopt; }); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 51bafe0846141..38073380eb54c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -13,6 +13,7 @@ #include "llvm/Support/Compiler.h" namespace llvm { +class MemoryLocation; class ScalarEvolution; class SCEV; } // namespace llvm @@ -74,6 +75,11 @@ getRecipesForUncountableExit(VPlan &Plan, SmallVectorImpl &Recipes, SmallVectorImpl &GEPs); +/// Return a MemoryLocation for \p R with noalias metadata populated from +/// \p R, if the recipe is supported and std::nullopt otherwise. The pointer of +/// the location is conservatively set to nullptr. +std::optional getMemoryLocation(const VPRecipeBase &R); + /// Extracts and returns NoWrap and FastMath flags from the induction binop in /// \p ID. 
inline VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index cb4bd793013b1..9609982b2c68f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -386,7 +386,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; DEFAULT-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { ; DEFAULT-NEXT: [[ENTRY:.*:]] ; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 60 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 28 ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; DEFAULT: [[VECTOR_MEMCHECK]]: ; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[E]], i64 4 @@ -427,16 +427,16 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] -; DEFAULT: [[VECTOR_BODY]]: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE33:.*]] ] -; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]] +; DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META8:![0-9]+]] ; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]] +; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META13:![0-9]+]] ; DEFAULT-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]] -; DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]] ; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP5]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE33:.*]] ] ; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[D]], i64 [[INDEX]] ; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; DEFAULT: [[PRED_STORE_IF]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index 0d8a1021bd438..50807df51c99e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -132,15 +132,15 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT: vector.ph: ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP0:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT]] to <16 x i8> -; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] -; DEFAULT: vector.body: -; DEFAULT-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META6:![0-9]+]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i64> poison, i64 [[TMP1]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT2]], <16 x i64> poison, <16 x i32> zeroinitializer ; DEFAULT-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT3]] to <16 x i8> +; DEFAULT-NEXT: [[TMP0:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT]] to <16 x i8> ; DEFAULT-NEXT: [[TMP3:%.*]] = and <16 x i8> [[TMP2]], [[TMP0]] +; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; DEFAULT: vector.body: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] ; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 16 ; DEFAULT-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1, !alias.scope [[META9:![0-9]+]], !noalias [[META6]] @@ -156,15 +156,15 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i16> poison, i16 [[X]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT4]], <8 x i16> poison, <8 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <8 x i16> [[BROADCAST_SPLAT5]] to <8 x i8> -; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; DEFAULT: vec.epilog.vector.body: -; DEFAULT-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP8:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META6]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP8]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT7]], <8 x i64> poison, <8 x i32> zeroinitializer ; DEFAULT-NEXT: [[TMP9:%.*]] = trunc <8 x i64> [[BROADCAST_SPLAT8]] to <8 x i8> +; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <8 x i16> [[BROADCAST_SPLAT5]] to <8 x i8> ; DEFAULT-NEXT: [[TMP10:%.*]] = and <8 x i8> [[TMP9]], [[TMP7]] +; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; DEFAULT: vec.epilog.vector.body: +; DEFAULT-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX6]] ; DEFAULT-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP11]], align 1, !alias.scope [[META9]], !noalias [[META6]] ; DEFAULT-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll index ed797fcd6c026..dca4f47738309 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll @@ -17,15 +17,15 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) { ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: 
[[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[DST]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 100, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT3]], align 4 [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP5]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 725fa49c0930c..b3c45a565a8fe 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -329,72 +329,85 @@ for.end: define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-LABEL: @multi_exit( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[UMAX6:%.*]] = call i64 @llvm.umax.i64(i64 [[B:%.*]], i64 1) -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX6]], -1 +; CHECK-NEXT: [[UMAX9:%.*]] = call i64 @llvm.umax.i64(i64 [[B:%.*]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX9]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]] -; CHECK-NEXT: [[UMIN7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN7]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 28 +; CHECK-NEXT: [[UMIN10:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN10]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 24 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1) ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[UMAX]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]] ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[A]]) -; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[UMIN]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 1, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[UMIN]], 4294967295 -; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; CHECK-NEXT: br i1 [[TMP10]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 1, [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMIN]], 4294967295 +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; 
CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[UMIN]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt i64 [[UMIN]], 4294967295 +; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP9]], [[TMP13]] +; CHECK-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 1 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC_1:%.*]], i64 8 ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC_2:%.*]], i64 8 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC_1]], [[SCEVGEP]] +; CHECK-NEXT: [[UMAX3:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1) +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[UMAX3]], -1 +; CHECK-NEXT: [[TMP16:%.*]] = freeze i64 [[TMP15]] +; CHECK-NEXT: [[UMIN4:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP16]], i64 [[A]]) +; CHECK-NEXT: [[TMP17:%.*]] = shl i64 [[UMIN4]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SRC_3:%.*]], i64 [[TMP18]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC_2]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] -; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC_2]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP5]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[SRC_3]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT8]] ; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP12]] -; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[SRC_1]], align 8, !alias.scope [[META6:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META6:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP21]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META9:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP14]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector 
<2 x i64> [[BROADCAST_SPLATINSERT9]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = and <2 x i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i1> [[TMP17]] to <2 x i8> -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 1 -; CHECK-NEXT: store i8 [[TMP19]], ptr [[DST]], align 1, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[SRC_3]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 2 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP25]], align 8, !alias.scope [[META9:![0-9]+]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq <2 x i64> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = and <2 x i1> [[TMP23]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = zext <2 x i1> [[TMP27]] to <2 x i8> +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x i8> [[TMP28]], i32 1 +; CHECK-NEXT: store i8 [[TMP29]], ptr [[DST]], align 1, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL11:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV_1_WIDE:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT_WIDE:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL11]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_LATCH]] ] ; CHECK-NEXT: [[EC_1:%.*]] = icmp ult i64 [[IV_1_WIDE]], [[A]] ; CHECK-NEXT: br i1 [[EC_1]], label [[LOOP_LATCH]], label [[EXIT:%.*]] ; CHECK: loop.latch: +; CHECK-NEXT: [[SRC_1:%.*]] = getelementptr inbounds i64, ptr [[SRC_3]], i32 [[IV_1]] ; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[SRC_1]], align 8 ; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[SRC_2]], align 8 ; CHECK-NEXT: [[CMP55_US:%.*]] = icmp eq i64 [[L_1]], 0 @@ -405,7 +418,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; 
CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; CHECK-NEXT: [[IV_1_NEXT_WIDE]] = zext i32 [[IV_1_NEXT]] to i64 ; CHECK-NEXT: [[EC_2:%.*]] = icmp ult i64 [[IV_1_NEXT_WIDE]], [[B]] -; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -419,7 +432,8 @@ loop: br i1 %ec.1, label %loop.latch, label %exit loop.latch: - %l.1 = load i64, ptr %src.1, align 8 + %gep.src.1 = getelementptr inbounds i64, ptr %src.1, i32 %iv.1 + %l.1 = load i64, ptr %gep.src.1, align 8 %l.2 = load i64, ptr %src.2, align 8 %cmp55.us = icmp eq i64 %l.1, 0 %cmp.i.us = icmp ne i64 %l.2, 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll index 63f9a1310d15a..dbd7019188d07 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -278,10 +278,10 @@ define void @uniform_copy(ptr %A, ptr %B) { ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META12:![0-9]+]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META12:![0-9]+]] ; CHECK-NEXT: store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 diff --git a/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll index 8615401af34f8..7bbc186dcbbae 100644 --- a/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll +++ b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll @@ -21,12 +21,12 @@ define void @hoist_invariant_load_noalias_due_to_memchecks(ptr %dst, ptr %invari ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll 
b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll index bd0fd77e7c391..9bbd67059e84d 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll @@ -102,6 +102,9 @@ define void @ir_tbaa_different(ptr %base, ptr %end, ptr %src) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SRC]], align 4, !alias.scope [[META10:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP4]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -109,9 +112,6 @@ define void @ir_tbaa_different(ptr %base, ptr %end, ptr %src) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SRC]], align 4, !alias.scope [[META10:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x float>, ptr [[NEXT_GEP]], align 4, !alias.scope [[META13:![0-9]+]], !noalias [[META10]] ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x float> [[WIDE_VEC]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x float> [[WIDE_VEC]], <4 x float> poison, <2 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll index d21621e46b79c..05cfb1957a766 100644 --- a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll @@ -61,14 +61,14 @@ define void @Test(ptr nocapture %obj, i64 %z) #0 { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[Z]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[Z]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META3:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP13]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> 
poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META3:![0-9]+]] ; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] @@ -125,14 +125,14 @@ define void @Test(ptr nocapture %obj, i64 %z) #0 { ; CHECK-HOIST: vector.ph: ; CHECK-HOIST-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[Z]], 4 ; CHECK-HOIST-NEXT: [[N_VEC:%.*]] = sub i64 [[Z]], [[N_MOD_VF]] +; CHECK-HOIST-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-HOIST-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; CHECK-HOIST-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-HOIST-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-HOIST: vector.body: ; CHECK-HOIST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-HOIST-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 0, i64 [[INDEX]] -; CHECK-HOIST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META0:![0-9]+]] -; CHECK-HOIST-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope [[META3:![0-9]+]] -; CHECK-HOIST-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 -; CHECK-HOIST-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-HOIST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META3:![0-9]+]] ; CHECK-HOIST-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] ; CHECK-HOIST-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[INDEX]] ; CHECK-HOIST-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll index 4d50a814b621d..f49a2d90b0e84 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll @@ -23,17 +23,17 @@ define void @test1_select_invariant(ptr %src.1, ptr %src.2, ptr %dst, i1 %c, i8 ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[PTR_SEL]], align 8, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8 ; CHECK-NEXT: [[INDUCTION2:%.*]] = add i8 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[PTR_SEL]], align 8, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i8 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i8 [[INDUCTION2]] -; CHECK-NEXT: store i8 
[[TMP10]], ptr [[TMP11]], align 2, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] -; CHECK-NEXT: store i8 [[TMP10]], ptr [[TMP8]], align 2, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store i8 [[TMP6]], ptr [[TMP11]], align 2, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: store i8 [[TMP6]], ptr [[TMP8]], align 2, !alias.scope [[META3]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/pr50686.ll b/llvm/test/Transforms/LoopVectorize/pr50686.ll index be9110ce0093a..5a56cdfefbb8f 100644 --- a/llvm/test/Transforms/LoopVectorize/pr50686.ll +++ b/llvm/test/Transforms/LoopVectorize/pr50686.ll @@ -15,17 +15,17 @@ define void @m(ptr nocapture %p, ptr nocapture %p2, i32 %q) { ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope [[META0]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0]] +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0]] ; CHECK-NEXT: [[TMP5:%.*]] = sub nsw i32 [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP7]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-align.ll b/llvm/test/Transforms/LoopVectorize/reduction-align.ll index 028eb3b05957d..0c45b96874da2 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-align.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-align.ll @@ -23,13 +23,13 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[HEIGHT]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[HEIGHT]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: 
vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1]] = add <4 x i16> [[BROADCAST_SPLAT]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll index a1329598529fd..25f40be238338 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll @@ -1511,14 +1511,14 @@ define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) { ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8, !alias.scope [[META69:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[C]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8, !alias.scope [[META69:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[TMP1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP0]], align 8, !alias.scope [[META72:![0-9]+]], !noalias [[META69]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[C]], i32 [[INDEX]] +; CHECK-NEXT: store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8, !alias.scope [[META72:![0-9]+]], !noalias [[META69]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP74:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll index 70adac2103feb..fb25b2bc7b906 100644 --- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll +++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll @@ -80,13 +80,13 @@ define void @single_scalar_cast_stored(ptr %src, ptr %dst, i32 %n) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2, !alias.scope [[META4:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i16 [[TMP0]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = and i16 [[TMP0]], 15 ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP4]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: store i16 [[TMP5]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll index e5e02674704f9..22cf860c8b58c 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll @@ -50,6 +50,7 @@ define void @expand(ptr %src, ptr %dst, i64 %0) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP8]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr [[SRC]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer @@ -58,7 +59,6 @@ define void @expand(ptr %src, ptr %dst, i64 %0) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr [[SRC]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: [[TMP20:%.*]] = shl <4 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0 ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1 diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll index a35bcf1c5a88d..c17b15138329c 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll @@ -8,19 +8,69 @@ target triple = "arm64-apple-macosx" define void @hoist_invariant_load(ptr %invariant_ptr, i64 %num_elements, ptr %array) { ; CHECK-LABEL: define void @hoist_invariant_load( ; CHECK-SAME: ptr readonly captures(none) [[INVARIANT_PTR:%.*]], i64 [[NUM_ELEMENTS:%.*]], ptr captures(none) [[ARRAY:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUM_ELEMENTS]], 0 -; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH:.*]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[I2:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH_PREHEADER:.*]] +; CHECK: [[LOOP_LATCH_PREHEADER]]: 
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_ELEMENTS]], 11 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[LOOP_LATCH_PREHEADER6:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[NUM_ELEMENTS]], 5 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP1]], i64 -24 +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[INVARIANT_PTR]], i64 8 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ARRAY]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[INVARIANT_PTR]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[LOOP_LATCH_PREHEADER6]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[NUM_ELEMENTS]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_ELEMENTS]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[INVARIANT_PTR]], align 8, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[I2:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr nusw %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i64 32 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 96 +; CHECK-NEXT: [[TMP12:%.*]] = load <5 x double>, ptr [[GEP]], align 8, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <5 x double> [[TMP12]], <5 x double> poison, <2 x i32> <i32 0, i32 4> +; CHECK-NEXT: [[TMP13:%.*]] = load <5 x double>, ptr [[TMP9]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <5 x double> [[TMP13]], <5 x double> poison, <2 x i32> <i32 0, i32 4> +; CHECK-NEXT: [[TMP14:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP14]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC5]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP17]], i64 1 +; CHECK-NEXT: store double [[TMP15]], ptr [[GEP]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store double [[TMP16]], ptr [[TMP7]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store double [[TMP18]], ptr [[TMP9]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store double [[TMP19]], ptr [[TMP11]], 
align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[I2]], 4 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[LOOP_LATCH_PREHEADER6]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[LOOP_LATCH_PREHEADER6]]: +; CHECK-NEXT: [[I2_PH:%.*]] = phi i64 [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_LATCH_PREHEADER]] ], [ [[N_VEC]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: br label %[[LOOP_LATCH:.*]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[I3:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH]] ], [ [[I2_PH]], %[[LOOP_LATCH_PREHEADER6]] ] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr nusw %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I3]] ; CHECK-NEXT: [[INVARIANT_VAL:%.*]] = load double, ptr [[INVARIANT_PTR]], align 8 -; CHECK-NEXT: [[ARRAY_VAL:%.*]] = load double, ptr [[GEP]], align 8 +; CHECK-NEXT: [[ARRAY_VAL:%.*]] = load double, ptr [[GEP1]], align 8 ; CHECK-NEXT: [[SUM:%.*]] = fadd double [[INVARIANT_VAL]], [[ARRAY_VAL]] -; CHECK-NEXT: store double [[SUM]], ptr [[GEP]], align 8 -; CHECK-NEXT: [[I_NEXT]] = add nuw i64 [[I2]], 1 +; CHECK-NEXT: store double [[SUM]], ptr [[GEP1]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nuw i64 [[I3]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[NUM_ELEMENTS]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_LATCH]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_LATCH]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; From f369a53d823b003ece9fee1020d3780c974f1db5 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 18 Nov 2025 09:49:42 +0000 Subject: [PATCH 16/35] [DAGCombiner] Fold select into partial.reduce.add operands. (#167857) This generates more optimal codegen when using partial reductions with predication. ``` partial_reduce_*mla(acc, sel(p, mul(*ext(a), *ext(b)), splat(0)), splat(1)) -> partial_reduce_*mla(acc, sel(p, a, splat(0)), b) partial.reduce.*mla(acc, sel(p, *ext(op), splat(0)), splat(1)) -> partial.reduce.*mla(acc, sel(p, op, splat(0)), splat(trunc(1))) ``` --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 4 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 68 ++++++-- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 5 + .../partial-reduction-add-predicated.ll | 159 ++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll | 39 +++-- 5 files changed, 247 insertions(+), 28 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/partial-reduction-add-predicated.ll diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index cd466dceb900f..cfc8a4243e894 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1968,6 +1968,10 @@ LLVM_ABI bool isOnesOrOnesSplat(SDValue N, bool AllowUndefs = false); /// Build vector implicit truncation is allowed. LLVM_ABI bool isZeroOrZeroSplat(SDValue N, bool AllowUndefs = false); +/// Return true if the value is a constant (+/-)0.0 floating-point value or a +/// splatted vector thereof (with no undefs). +LLVM_ABI bool isZeroOrZeroSplatFP(SDValue N, bool AllowUndefs = false); + /// Return true if \p V is either a integer or FP constant. 
inline bool isIntOrFPConstant(SDValue V) { return isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c9513611e6dcb..94afdc5db6613 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13018,22 +13018,34 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { return SDValue(); } -// partial_reduce_*mla(acc, mul(ext(a), ext(b)), splat(1)) +// partial_reduce_*mla(acc, mul(*ext(a), *ext(b)), splat(1)) // -> partial_reduce_*mla(acc, a, b) // -// partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) -// -> partial_reduce_*mla(acc, x, C) +// partial_reduce_*mla(acc, mul(*ext(x), splat(C)), splat(1)) +// -> partial_reduce_*mla(acc, x, splat(C)) // -// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0)) -// -> partial_reduce_fmla(acc, a, b) +// partial_reduce_*mla(acc, sel(p, mul(*ext(a), *ext(b)), splat(0)), splat(1)) +// -> partial_reduce_*mla(acc, sel(p, a, splat(0)), b) +// +// partial_reduce_*mla(acc, sel(p, mul(*ext(a), splat(C)), splat(0)), splat(1)) +// -> partial_reduce_*mla(acc, sel(p, a, splat(0)), splat(C)) SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDLoc DL(N); auto *Context = DAG.getContext(); SDValue Acc = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); - unsigned Opc = Op1->getOpcode(); + + // Handle predication by moving the SELECT into the operand of the MUL. + SDValue Pred; + unsigned Opc = Op1->getOpcode(); + if (Opc == ISD::VSELECT && (isZeroOrZeroSplat(Op1->getOperand(2)) || + isZeroOrZeroSplatFP(Op1->getOperand(2)))) { + Pred = Op1->getOperand(0); + Op1 = Op1->getOperand(1); + Opc = Op1->getOpcode(); + } + if (Opc != ISD::MUL && Opc != ISD::FMUL && Opc != ISD::SHL) return SDValue(); @@ -13068,6 +13080,19 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDValue LHSExtOp = LHS->getOperand(0); EVT LHSExtOpVT = LHSExtOp.getValueType(); + // When Pred is non-zero, set Op = select(Pred, Op, splat(0)) and freeze + // OtherOp to keep the same semantics when moving the selects into the MUL + // operands. + auto ApplyPredicate = [&](SDValue &Op, SDValue &OtherOp) { + if (Pred) { + EVT OpVT = Op.getValueType(); + SDValue Zero = OpVT.isFloatingPoint() ? 
DAG.getConstantFP(0.0, DL, OpVT) + : DAG.getConstant(0, DL, OpVT); + Op = DAG.getSelect(DL, OpVT, Pred, Op, Zero); + OtherOp = DAG.getFreeze(OtherOp); + } + }; + // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) APInt C; @@ -13090,8 +13115,9 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { TLI.getTypeToTransformTo(*Context, LHSExtOpVT))) return SDValue(); - return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp, - DAG.getConstant(CTrunc, DL, LHSExtOpVT)); + SDValue C = DAG.getConstant(CTrunc, DL, LHSExtOpVT); + ApplyPredicate(C, LHSExtOp); + return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp, C); } unsigned RHSOpcode = RHS->getOpcode(); @@ -13132,17 +13158,17 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { TLI.getTypeToTransformTo(*Context, LHSExtOpVT))) return SDValue(); + ApplyPredicate(RHSExtOp, LHSExtOp); return DAG.getNode(NewOpc, DL, N->getValueType(0), Acc, LHSExtOp, RHSExtOp); } -// partial.reduce.umla(acc, zext(op), splat(1)) -// -> partial.reduce.umla(acc, op, splat(trunc(1))) -// partial.reduce.smla(acc, sext(op), splat(1)) -// -> partial.reduce.smla(acc, op, splat(trunc(1))) +// partial.reduce.*mla(acc, *ext(op), splat(1)) +// -> partial.reduce.*mla(acc, op, splat(trunc(1))) // partial.reduce.sumla(acc, sext(op), splat(1)) // -> partial.reduce.smla(acc, op, splat(trunc(1))) -// partial.reduce.fmla(acc, fpext(op), splat(1.0)) -// -> partial.reduce.fmla(acc, op, splat(1.0)) +// +// partial.reduce.*mla(acc, sel(p, *ext(op), splat(0)), splat(1)) +// -> partial.reduce.*mla(acc, sel(p, op, splat(0)), splat(trunc(1))) SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDLoc DL(N); SDValue Acc = N->getOperand(0); @@ -13152,7 +13178,15 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { if (!llvm::isOneOrOneSplat(Op2) && !llvm::isOneOrOneSplatFP(Op2)) return SDValue(); + SDValue Pred; unsigned Op1Opcode = Op1.getOpcode(); + if (Op1Opcode == ISD::VSELECT && (isZeroOrZeroSplat(Op1->getOperand(2)) || + isZeroOrZeroSplatFP(Op1->getOperand(2)))) { + Pred = Op1->getOperand(0); + Op1 = Op1->getOperand(1); + Op1Opcode = Op1->getOpcode(); + } + if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); @@ -13181,6 +13215,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { ? DAG.getConstantFP(1, DL, UnextOp1VT) : DAG.getConstant(1, DL, UnextOp1VT); + if (Pred) { + SDValue Zero = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? 
DAG.getConstantFP(0, DL, UnextOp1VT) + : DAG.getConstant(0, DL, UnextOp1VT); + Constant = DAG.getSelect(DL, UnextOp1VT, Pred, Constant, Zero); + } return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1, Constant); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index c2b4c19846316..16fdef06d6679 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -12971,6 +12971,11 @@ bool llvm::isZeroOrZeroSplat(SDValue N, bool AllowUndefs) { return C && C->isZero(); } +bool llvm::isZeroOrZeroSplatFP(SDValue N, bool AllowUndefs) { + ConstantFPSDNode *C = isConstOrConstSplatFP(N, AllowUndefs); + return C && C->isZero(); +} + HandleSDNode::~HandleSDNode() { DropOperands(); } diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add-predicated.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add-predicated.ll new file mode 100644 index 0000000000000..24cdd0a852222 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/partial-reduction-add-predicated.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64" + +define <4 x i32> @predicate_dot_fixed_length(<4 x i32> %acc, <16 x i1> %p, <16 x i8> %a, <16 x i8> %b) #0 { +; CHECK-LABEL: predicate_dot_fixed_length: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b +; CHECK-NEXT: ret + %ext.1 = sext <16 x i8> %a to <16 x i32> + %ext.2 = sext <16 x i8> %b to <16 x i32> + %mul = mul nsw <16 x i32> %ext.1, %ext.2 + %sel = select <16 x i1> %p, <16 x i32> %mul, <16 x i32> zeroinitializer + %red = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %sel) + ret <4 x i32> %red +} + +define <4 x i32> @predicate_dot_by_C_fixed_length(<4 x i32> %acc, <16 x i1> %p, <16 x i8> %a) #0 { +; CHECK-LABEL: predicate_dot_by_C_fixed_length: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: movi v3.16b, #127 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b +; CHECK-NEXT: ret + %ext.1 = sext <16 x i8> %a to <16 x i32> + %mul = mul nsw <16 x i32> %ext.1, splat(i32 127) + %sel = select <16 x i1> %p, <16 x i32> %mul, <16 x i32> zeroinitializer + %red = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %sel) + ret <4 x i32> %red +} + +define <vscale x 4 x i32> @predicate_dot_scalable(<vscale x 4 x i32> %acc, <vscale x 16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 { +; CHECK-LABEL: predicate_dot_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: sel z2.b, p0, z2.b, z3.b +; CHECK-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-NEXT: ret + %ext.1 = sext <vscale x 16 x i8> %a to <vscale x 16 x i32> + %ext.2 = sext <vscale x 16 x i8> %b to <vscale x 16 x i32> + %mul = mul nsw <vscale x 16 x i32> %ext.1, %ext.2 + %sel = select <vscale x 16 x i1> %p, <vscale x 16 x i32> %mul, <vscale x 16 x i32> zeroinitializer + %red = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %sel) + ret <vscale x 4 x i32> %red +} + +define <vscale x 4 x i32> @predicate_dot_by_C_scalable(<vscale x 4 x i32> %acc, <vscale x 16 x i1> %p, <vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: predicate_dot_by_C_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.b, p0/z, #127 // =0x7f +; CHECK-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-NEXT: ret + %ext.1 = sext <vscale x 16 x i8> %a to <vscale x 16 x i32> + %mul = mul nsw <vscale x 16 x i32> %ext.1, splat(i32 127) + %sel = select <vscale x 16 x i1> %p, <vscale x 16 x i32> %mul, <vscale x 16 x i32> zeroinitializer + %red = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %sel) + ret <vscale x 4 x i32> %red +} + +define <4 x i32> @predicate_ext_mul_fixed_length(<4 x i32> %acc, <16 x i1> %p, <16 x i8> %a) #0 { +; 
CHECK-LABEL: predicate_ext_mul_fixed_length: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v3.16b, #1 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b +; CHECK-NEXT: ret + %ext = sext <16 x i8> %a to <16 x i32> + %sel = select <16 x i1> %p, <16 x i32> %ext, <16 x i32> zeroinitializer + %red = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %sel) + ret <4 x i32> %red +} + +define <vscale x 4 x i32> @predicate_ext_mul_scalable(<vscale x 4 x i32> %acc, <vscale x 16 x i1> %p, <vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: predicate_ext_mul_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.b, p0/z, #1 // =0x1 +; CHECK-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-NEXT: ret + %ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32> + %sel = select <vscale x 16 x i1> %p, <vscale x 16 x i32> %ext, <vscale x 16 x i32> zeroinitializer + %red = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %sel) + ret <vscale x 4 x i32> %red +} + +define <4 x float> @predicated_fdot_fixed_length(<4 x float> %acc, <8 x i1> %p, <8 x half> %a, <8 x half> %b) #1 { +; CHECK-LABEL: predicated_fdot_fixed_length: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2 +; CHECK-NEXT: shl v1.8h, v1.8h, #15 +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: fdot z0.s, z2.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %ext.1 = fpext <8 x half> %a to <8 x float> + %ext.2 = fpext <8 x half> %b to <8 x float> + %mul = fmul <8 x float> %ext.1, %ext.2 + %sel = select <8 x i1> %p, <8 x float> %mul, <8 x float> zeroinitializer + %red = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %sel) + ret <4 x float> %red +} + +define <vscale x 4 x float> @predicated_fdot_scalable(<vscale x 4 x float> %acc, <vscale x 8 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b) #1 { +; CHECK-LABEL: predicated_fdot_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: sel z2.h, p0, z2.h, z3.h +; CHECK-NEXT: fdot z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %ext.1 = fpext <vscale x 8 x half> %a to <vscale x 8 x float> + %ext.2 = fpext <vscale x 8 x half> %b to <vscale x 8 x float> + %mul = fmul <vscale x 8 x float> %ext.1, %ext.2 + %sel = select <vscale x 8 x i1> %p, <vscale x 8 x float> %mul, <vscale x 8 x float> zeroinitializer + %red = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %sel) + ret <vscale x 4 x float> %red +} + +define <4 x float> @predicated_fpext_fmul_fixed_length(<4 x float> %acc, <8 x i1> %p, <8 x half> %a) #1 { +; CHECK-LABEL: predicated_fpext_fmul_fixed_length: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: movi v3.8h, #60, lsl #8 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2 +; CHECK-NEXT: shl v1.8h, v1.8h, #15 +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: fdot z0.s, z2.h, z1.h +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %ext = fpext <8 x half> %a to <8 x float> + %sel = select <8 x i1> %p, <8 x float> %ext, <8 x float> zeroinitializer + %red = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %sel) + ret <4 x float> %red +} + +define <vscale x 4 x float> @predicated_fpext_fmul_scalable(<vscale x 4 x float> %acc, <vscale x 8 x i1> %p, <vscale x 8 x half> %a) #1 { +; CHECK-LABEL: predicated_fpext_fmul_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fmov z2.h, p0/m, #1.00000000 +; CHECK-NEXT: fdot z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %ext = fpext <vscale x 8 x half> %a to <vscale x 8 x float> + %sel = select <vscale x 8 x i1> %p, <vscale x 8 x float> %ext, <vscale x 8 x float> zeroinitializer + %red = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %sel) + ret <vscale x 4 x float> %red +} + +attributes #0 = { nounwind "target-features"="+sve,+dotprod" } +attributes #1 = { nounwind "target-features"="+sve2p1,+dotprod" } diff --git a/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll 
b/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll index 72bf1fa9a8327..d6384a6913efe 100644 --- a/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll @@ -996,20 +996,31 @@ entry: } define <vscale x 2 x i32> @partial_reduce_select(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m) { -; CHECK-LABEL: partial_reduce_select: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v8 -; CHECK-NEXT: vsext.vf2 v14, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vwmul.vv v8, v12, v14, v0.t -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v11, v8 -; CHECK-NEXT: vadd.vv v9, v9, v10 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: ret +; NODOT-LABEL: partial_reduce_select: +; NODOT: # %bb.0: # %entry +; NODOT-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; NODOT-NEXT: vsext.vf2 v12, v8 +; NODOT-NEXT: vsext.vf2 v14, v9 +; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; NODOT-NEXT: vmv.v.i v8, 0 +; NODOT-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; NODOT-NEXT: vwmul.vv v8, v12, v14, v0.t +; NODOT-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; NODOT-NEXT: vadd.vv v8, v11, v8 +; NODOT-NEXT: vadd.vv v9, v9, v10 +; NODOT-NEXT: vadd.vv v8, v9, v8 +; NODOT-NEXT: ret +; +; DOT-LABEL: partial_reduce_select: +; DOT: # %bb.0: # %entry +; DOT-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; DOT-NEXT: vmv.v.i v10, 0 +; DOT-NEXT: vmerge.vvm v10, v10, v9, v0 +; DOT-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; DOT-NEXT: vmv.v.i v9, 0 +; DOT-NEXT: vqdot.vv v9, v8, v10 +; DOT-NEXT: vmv.v.v v8, v9 +; DOT-NEXT: ret entry: %a.sext = sext <vscale x 8 x i8> %a to <vscale x 8 x i32> %b.sext = sext <vscale x 8 x i8> %b to <vscale x 8 x i32> From 542d88d2a861577433ea64e99b97cbfbf1c50b8a Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Tue, 18 Nov 2025 10:06:40 +0000 Subject: [PATCH 17/35] [lldb][nfc] Fix incorrect union usage in UnwindAssemblyInstEmulation (#168341) This is harmless due to the previous checks for > 0, but it is still confusing for readers.
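For illustration, here is a minimal self-contained sketch of the pattern being corrected (hypothetical names; the real code lives in UnwindAssemblyInstEmulation.cpp and reads EmulateInstruction's context info): each branch checked one union member but then read the other one.

```cpp
#include <cstdint>

// Hypothetical stand-in for the union of immediate payloads.
union ImmediateInfo {
  uint32_t unsigned_data32;
  int32_t signed_data32;
};

int64_t forwardBranchOffset(const ImmediateInfo &info, bool is_signed) {
  if (!is_signed && info.unsigned_data32 > 0)
    return info.unsigned_data32; // before the fix: read signed_data32 here
  if (is_signed && info.signed_data32 > 0)
    return info.signed_data32; // before the fix: read unsigned_data32 here
  return 0;
}
```

Because both members are 32 bits wide and the value is only taken when positive, the swapped reads produced the same result, which is why this change is NFC.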
--- .../InstEmulation/UnwindAssemblyInstEmulation.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp index 8437a51471ca2..987586b97dfdc 100644 --- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp +++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp @@ -507,12 +507,12 @@ bool UnwindAssemblyInstEmulation::WriteRegister( case EmulateInstruction::eContextRelativeBranchImmediate: { if (context.GetInfoType() == EmulateInstruction::eInfoTypeISAAndImmediate && context.info.ISAAndImmediate.unsigned_data32 > 0) { - m_forward_branch_offset = - context.info.ISAAndImmediateSigned.signed_data32; + m_forward_branch_offset = context.info.ISAAndImmediate.unsigned_data32; } else if (context.GetInfoType() == EmulateInstruction::eInfoTypeISAAndImmediateSigned && context.info.ISAAndImmediateSigned.signed_data32 > 0) { - m_forward_branch_offset = context.info.ISAAndImmediate.unsigned_data32; + m_forward_branch_offset = + context.info.ISAAndImmediateSigned.signed_data32; } else if (context.GetInfoType() == EmulateInstruction::eInfoTypeImmediate && context.info.unsigned_immediate > 0) { From 8603552133c832080dac6de2460ebf5d2a1f1be0 Mon Sep 17 00:00:00 2001 From: Szymon Piotr Milczek Date: Tue, 18 Nov 2025 11:09:42 +0100 Subject: [PATCH 18/35] [MC] AsmLexer assert buffer is null-terminated at CurBuf.end() (#154972) AsmLexer expects the buffer it's provided for lexing to be NULL-terminated, where the NULL terminator is pointed to by `CurBuf.end()`. However, this expectation isn't explicitly stated anywhere. This commit adds a couple of comments as well as an assert as a means of documenting this expectation. --- llvm/include/llvm/MC/MCParser/AsmLexer.h | 7 +++++++ llvm/lib/MC/MCParser/AsmLexer.cpp | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/llvm/include/llvm/MC/MCParser/AsmLexer.h b/llvm/include/llvm/MC/MCParser/AsmLexer.h index c514b768637d1..7848fc706d5eb 100644 --- a/llvm/include/llvm/MC/MCParser/AsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/AsmLexer.h @@ -44,6 +44,7 @@ class AsmLexer { SmallVector<AsmToken, 1> CurTok; const char *CurPtr = nullptr; + /// NULL-terminated buffer. NULL terminator must reside at `CurBuf.end()`. StringRef CurBuf; /// The location and description of the current error @@ -190,6 +191,12 @@ class AsmLexer { /// literals. void setLexHLASMStrings(bool V) { LexHLASMStrings = V; } + /// Set buffer to be lexed. + /// `Buf` must be NULL-terminated. NULL terminator must reside at `Buf.end()`. + /// `ptr`, if provided, must be in the range [`Buf.begin()`, `Buf.end()`] or NULL; + /// it specifies where lexing of the buffer should begin. + /// `EndStatementAtEOF` specifies whether `AsmToken::EndOfStatement` should be + /// returned upon reaching the end of the buffer. LLVM_ABI void setBuffer(StringRef Buf, const char *ptr = nullptr, bool EndStatementAtEOF = true); diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index 1af4a297babaa..8e4b7be98bdb6 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -119,6 +119,11 @@ AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { void AsmLexer::setBuffer(StringRef Buf, const char *ptr, bool EndStatementAtEOF) { + // Buffer must be NULL-terminated. NULL terminator must reside at `Buf.end()`. // It must be safe to dereference `Buf.end()`. 
+ assert(*Buf.end() == '\0' && + "Buffer provided to AsmLexer lacks null terminator."); + CurBuf = Buf; if (ptr) From 128caa1ba37fe7f216226d24e8d616bab2d68ee9 Mon Sep 17 00:00:00 2001 From: Andrei Golubev Date: Tue, 18 Nov 2025 10:18:53 +0000 Subject: [PATCH 19/35] [mlir][bufferization] Refine tensor-buffer compatibility checks (#167705) Generally, to_tensor and to_buffer already perform sufficient verification. However, there are some unnecessarily strict constraints: * a builtin tensor requires its buffer counterpart to always be a memref * to_buffer on a ranked tensor is required to always return a memref These checks are assertions (i.e., preconditions); however, they actually prevent an apparently useful bufferization where builtin tensors could become custom buffers. Lift these assertions, maintaining the verification procedure unchanged, to allow builtin -> custom bufferizations at the operation boundary level. --- .../IR/BufferizableOpInterface.cpp | 12 +--- .../Bufferization/IR/BufferizationDialect.cpp | 3 - mlir/test/Dialect/Bufferization/invalid.mlir | 60 +++++++++++++++++++ mlir/test/Dialect/Bufferization/ops.mlir | 37 ++++++++++++ mlir/test/lib/Dialect/Test/TestTypes.cpp | 18 ++++-- 5 files changed, 110 insertions(+), 20 deletions(-) diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp index e0cf353da207f..9b11270e7bbe2 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp @@ -680,16 +680,6 @@ bool AnalysisState::hasUndefinedContents(OpOperand *opOperand) const { return false; } -// bufferization.to_buffer is not allowed to change the rank. -static void ensureToBufferOpIsValid(Value tensor, Type memrefType) { -#ifndef NDEBUG - auto rankedTensorType = llvm::dyn_cast<RankedTensorType>(tensor.getType()); - assert((!rankedTensorType || llvm::cast<MemRefType>(memrefType).getRank() == - rankedTensorType.getRank()) && - "to_buffer would be invalid: mismatching ranks"); -#endif -} - FailureOr<Value> bufferization::getBuffer(RewriterBase &rewriter, Value value, const BufferizationOptions &options, const BufferizationState &state) { @@ -708,7 +698,7 @@ FailureOr<Value> bufferization::getBuffer(RewriterBase &rewriter, Value value, FailureOr<BufferLikeType> bufferType = getBufferType(value, options, state); if (failed(bufferType)) return failure(); - ensureToBufferOpIsValid(value, *bufferType); + return bufferization::ToBufferOp::create(rewriter, value.getLoc(), *bufferType, value) .getResult(); diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp index d6c3cd62ee742..bd177ba1afccd 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp @@ -54,9 +54,6 @@ struct BuiltinTensorExternalModel mlir::LogicalResult verifyCompatibleBufferType( mlir::Type tensor, BufferLikeType bufferType, llvm::function_ref<mlir::InFlightDiagnostic()> emitError) const { - assert(isa<TensorType>(tensor) && "expected tensor type"); - assert(isa<BaseMemRefType>(bufferType) && "expected memref type"); - auto tensorType = cast<TensorType>(tensor); auto memrefType = cast<BaseMemRefType>(bufferType); diff --git a/mlir/test/Dialect/Bufferization/invalid.mlir b/mlir/test/Dialect/Bufferization/invalid.mlir index 2c8807b66de74..9884b040119d0 100644 --- a/mlir/test/Dialect/Bufferization/invalid.mlir +++ b/mlir/test/Dialect/Bufferization/invalid.mlir @@ -127,3 +127,63 @@ func.func @invalid_manual_deallocation() { // expected-error @below{{op attribute '
'bufferization.manual_deallocation' can be used only on ops that have an allocation and/or free side effect}} arith.constant {bufferization.manual_deallocation} 0 : index } + +// ----- + +func.func @invalid_rank_to_buffer(%t: tensor<1x2x3x4xf32>) { + // expected-error @below{{'bufferization.to_buffer' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{shapes do not match}} + %b = bufferization.to_buffer %t + : tensor<1x2x3x4xf32> to memref<1x2x3xf32> + return +} + +// ----- + +func.func @invalid_rank_to_tensor(%b: memref<1x2x3xf32>) { + // expected-error @below{{'bufferization.to_tensor' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{shapes do not match}} + %t = bufferization.to_tensor %b + : memref<1x2x3xf32> to tensor<1x2x3x4xf32> + return +} + +// ----- + +func.func @invalid_shape_to_buffer(%t: tensor<1x2x3x4xf32>) { + // expected-error @below{{'bufferization.to_buffer' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{shapes do not match}} + %b = bufferization.to_buffer %t + : tensor<1x2x3x4xf32> to memref<1x2x4x3xf32> + return +} + +// ----- + +func.func @invalid_shape_to_tensor(%b: memref<1x2x4x3xf32>) { + // expected-error @below{{'bufferization.to_tensor' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{shapes do not match}} + %t = bufferization.to_tensor %b + : memref<1x2x4x3xf32> to tensor<1x2x3x4xf32> + return +} + +// ----- + +func.func @invalid_type_to_buffer(%t: tensor<1x2x3x4xf32>) { + // expected-error @below{{'bufferization.to_buffer' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{element types do not match}} + %b = bufferization.to_buffer %t + : tensor<1x2x3x4xf32> to memref<1x2x3x4xf16> + return +} + +// ----- + +func.func @invalid_type_to_tensor(%b: memref<1x2x3x4xf16>) { + // expected-error @below{{'bufferization.to_tensor' op failed to verify that specified tensor and buffer types match}} + // expected-error @below{{element types do not match}} + %t2 = bufferization.to_tensor %b + : memref<1x2x3x4xf16> to tensor<1x2x3x4xf32> + return +} diff --git a/mlir/test/Dialect/Bufferization/ops.mlir b/mlir/test/Dialect/Bufferization/ops.mlir index fc6df4a09f706..b0db1bb2d0389 100644 --- a/mlir/test/Dialect/Bufferization/ops.mlir +++ b/mlir/test/Dialect/Bufferization/ops.mlir @@ -83,3 +83,40 @@ func.func @test_dealloc_op(%arg0: memref<2xf32>, %arg1: memref<4xi32>, bufferization.dealloc return %0#0, %0#1 : i1, i1 } + +// CHECK: func.func @test_builtin_custom_builtin_type_conversion +// CHECK-SAME: (%[[t:.*]]: tensor<42xf32>) -> tensor<42xf32> +func.func @test_builtin_custom_builtin_type_conversion(%t: tensor<42xf32>) + -> tensor<42xf32> { + // CHECK: %[[buffer:.*]] = bufferization.to_buffer %[[t]] + // CHECK-SAME: to !test.test_memref<[42], f32> + %buffer = bufferization.to_buffer %t + : tensor<42xf32> to !test.test_memref<[42], f32> + + // CHECK: %[[tensor:.*]] = bufferization.to_tensor %[[buffer]] + // CHECK-SAME: to tensor<42xf32> + %tensor = bufferization.to_tensor %buffer + : !test.test_memref<[42], f32> to tensor<42xf32> + + // CHECK: return %[[tensor]] + return %tensor : tensor<42xf32> +} + +// CHECK: func.func @test_custom_builtin_custom_type_conversion +// CHECK-SAME: (%[[t:.*]]: !test.test_tensor<[42], f32>) +// CHECK-SAME: -> !test.test_tensor<[42], f32> +func.func @test_custom_builtin_custom_type_conversion(%t: !test.test_tensor<[42], f32>) 
+ -> !test.test_tensor<[42], f32> { + // CHECK: %[[buffer:.*]] = bufferization.to_buffer %[[t]] + // CHECK-SAME: to memref<42xf32> + %buffer = bufferization.to_buffer %t + : !test.test_tensor<[42], f32> to memref<42xf32> + + // CHECK: %[[tensor:.*]] = bufferization.to_tensor %[[buffer]] + // CHECK-SAME: to !test.test_tensor<[42], f32> + %tensor = bufferization.to_tensor %buffer + : memref<42xf32> to !test.test_tensor<[42], f32> + + // CHECK: return %[[tensor]] + return %tensor : !test.test_tensor<[42], f32> +} diff --git a/mlir/test/lib/Dialect/Test/TestTypes.cpp b/mlir/test/lib/Dialect/Test/TestTypes.cpp index 614121f1d43dd..9cf64a896d28a 100644 --- a/mlir/test/lib/Dialect/Test/TestTypes.cpp +++ b/mlir/test/lib/Dialect/Test/TestTypes.cpp @@ -569,11 +569,17 @@ TestTensorType::getBufferType( ::mlir::LogicalResult TestTensorType::verifyCompatibleBufferType( ::mlir::bufferization::BufferLikeType bufferType, ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) { - auto testMemref = dyn_cast<TestMemrefType>(bufferType); - if (!testMemref) - return emitError() << "expected TestMemrefType"; + if (auto testMemref = dyn_cast<TestMemrefType>(bufferType)) { + const bool valid = getShape() == testMemref.getShape() && + getElementType() == testMemref.getElementType(); + return mlir::success(valid); + } + + if (auto builtinMemref = dyn_cast<mlir::MemRefType>(bufferType)) { + const bool valid = getShape() == builtinMemref.getShape() && + getElementType() == builtinMemref.getElementType(); + return mlir::success(valid); + } - const bool valid = getShape() == testMemref.getShape() && - getElementType() == testMemref.getElementType(); - return mlir::success(valid); + return emitError() << "expected MemRefType or TestMemrefType"; } From f9256ca6cc8636347a73c028a1e30596aa27ac89 Mon Sep 17 00:00:00 2001 From: woruyu <1214539920@qq.com> Date: Tue, 18 Nov 2025 18:23:39 +0800 Subject: [PATCH 20/35] [Headers][X86] Allow AVX512 masked arithmetic ss/sd intrinsics to be used in constexpr (#162816) This PR resolves the ss/sd part of the AVX512 masked arithmetic intrinsics from #160559. 
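As a usage sketch (not part of this patch; assumes a Clang with these headers, compiling as C++ for an AVX512F target), the masked scalar operations can now fold entirely in the constant evaluator:

```cpp
// Hypothetical example; build with: clang++ -std=c++17 -mavx512f example.cpp
#include <immintrin.h>

// Mask bit 0 is set, so lane 0 becomes __A[0] + __B[0] = 9.0f;
// lanes 1..3 are passed through unchanged from __W.
constexpr __m128 Masked =
    _mm_mask_add_ss((__m128)(__v4sf){-1.0f, 10.0f, 20.0f, 30.0f}, 0x1,
                    (__m128)(__v4sf){4.0f, 0.0f, 0.0f, 0.0f},
                    (__m128)(__v4sf){5.0f, 0.0f, 0.0f, 0.0f});
static_assert(Masked[0] == 9.0f && Masked[3] == 30.0f,
              "evaluated at compile time");
```

This mirrors the TEST_CONSTEXPR checks added to the test files below.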
--- clang/include/clang/Basic/BuiltinsX86.td | 6 +- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 29 +++++++ clang/lib/AST/ExprConstant.cpp | 25 +++++++ clang/lib/Headers/avx10_2bf16intrin.h | 4 +- clang/lib/Headers/avx512fintrin.h | 64 ++++++++-------- clang/lib/Headers/avx512fp16intrin.h | 75 ++++++++----------- clang/test/CodeGen/X86/avx10_2bf16-builtins.c | 8 +- clang/test/CodeGen/X86/avx512f-builtins.c | 36 +++++++++ clang/test/CodeGen/X86/avx512fp16-builtins.c | 14 ++++ 9 files changed, 178 insertions(+), 83 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index a656fe341c8e0..7a14c6ec21a1a 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -4117,15 +4117,15 @@ let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVecto def selectpd_512 : X86Builtin<"_Vector<8, double>(unsigned char, _Vector<8, double>, _Vector<8, double>)">; } -let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512fp16", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectsh_128 : X86Builtin<"_Vector<8, _Float16>(unsigned char, _Vector<8, _Float16>, _Vector<8, _Float16>)">; } -let Features = "avx512bf16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512bf16", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectsbf_128 : X86Builtin<"_Vector<8, __bf16>(unsigned char, _Vector<8, __bf16>, _Vector<8, __bf16>)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectss_128 : X86Builtin<"_Vector<4, float>(unsigned char, _Vector<4, float>, _Vector<4, float>)">; def selectsd_128 : X86Builtin<"_Vector<2, double>(unsigned char, _Vector<2, double>, _Vector<2, double>)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 30426565407ba..5a96320e12b6f 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2838,6 +2838,30 @@ static bool interp__builtin_select(InterpState &S, CodePtr OpPC, return true; } +/// Scalar variant of AVX512 predicated select: +/// Result[i] = (Mask bit 0) ? LHS[i] : RHS[i], but only element 0 may change. +/// All other elements are taken from RHS. 
+static bool interp__builtin_select_scalar(InterpState &S, + const CallExpr *Call) { + unsigned N = + Call->getArg(1)->getType()->getAs<VectorType>()->getNumElements(); + + const Pointer &W = S.Stk.pop<Pointer>(); + const Pointer &A = S.Stk.pop<Pointer>(); + APSInt U = popToAPSInt(S, Call->getArg(0)); + const Pointer &Dst = S.Stk.peek<Pointer>(); + + bool TakeA0 = U.getZExtValue() & 1ULL; + + for (unsigned I = TakeA0; I != N; ++I) + Dst.elem<Floating>(I) = W.elem<Floating>(I); + if (TakeA0) + Dst.elem<Floating>(0) = A.elem<Floating>(0); + + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_blend(InterpState &S, CodePtr OpPC, const CallExpr *Call) { APSInt Mask = popToAPSInt(S, Call->getArg(2)); @@ -4151,6 +4175,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return APInt::getAllOnes(DstBits); }); + case clang::X86::BI__builtin_ia32_selectss_128: + case clang::X86::BI__builtin_ia32_selectsd_128: + case clang::X86::BI__builtin_ia32_selectsh_128: + case clang::X86::BI__builtin_ia32_selectsbf_128: + return interp__builtin_select_scalar(S, Call); case clang::X86::BI__builtin_ia32_vprotbi: case clang::X86::BI__builtin_ia32_vprotdi: case clang::X86::BI__builtin_ia32_vprotqi: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index ed1f1b7508ffc..74f6e3acb6b39 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12202,6 +12202,24 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), SourceLen), E); }; + auto EvalSelectScalar = [&](unsigned Len) -> bool { + APSInt Mask; + APValue AVal, WVal; + if (!EvaluateInteger(E->getArg(0), Mask, Info) || + !EvaluateAsRValue(Info, E->getArg(1), AVal) || + !EvaluateAsRValue(Info, E->getArg(2), WVal)) + return false; + + bool TakeA0 = (Mask.getZExtValue() & 1u) != 0; + SmallVector<APValue> Res; + Res.reserve(Len); + Res.push_back(TakeA0 ? 
AVal.getVectorElt(0) : WVal.getVectorElt(0)); + for (unsigned I = 1; I < Len; ++I) + Res.push_back(WVal.getVectorElt(I)); + APValue V(Res.data(), Res.size()); + return Success(V, E); + }; + switch (E->getBuiltinCallee()) { default: return false; @@ -12505,6 +12523,13 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return APInt((Src).trunc(DstBits)); return APInt::getAllOnes(DstBits); }); + case clang::X86::BI__builtin_ia32_selectss_128: + return EvalSelectScalar(4); + case clang::X86::BI__builtin_ia32_selectsd_128: + return EvalSelectScalar(2); + case clang::X86::BI__builtin_ia32_selectsh_128: + case clang::X86::BI__builtin_ia32_selectsbf_128: + return EvalSelectScalar(8); case clang::X86::BI__builtin_ia32_pmuldq128: case clang::X86::BI__builtin_ia32_pmuldq256: case clang::X86::BI__builtin_ia32_pmuldq512: diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h index 9f5b726d7b789..3df6930f94be3 100644 --- a/clang/lib/Headers/avx10_2bf16intrin.h +++ b/clang/lib/Headers/avx10_2bf16intrin.h @@ -221,12 +221,12 @@ static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_move_sbh(__m128bh __a, return __a; } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), __W); } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) { return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), _mm_setzero_pbh()); diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 53b18df764370..531c23a210586 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -1820,14 +1820,14 @@ _mm512_maskz_abs_epi32(__mmask16 __U, __m512i __A) { (__v16si)_mm512_setzero_si512()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_add_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_add_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -1850,14 +1850,14 @@ _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_add_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_add_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } @@ -1935,14 +1935,14 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_add_round_ps((A), (B), 
(R)), \ (__v16sf)_mm512_setzero_ps())) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_sub_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_sub_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -1964,14 +1964,14 @@ _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_sub_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_sub_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } @@ -2050,14 +2050,14 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ (__v16sf)_mm512_setzero_ps())) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_mul_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_mul_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -2079,14 +2079,14 @@ _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_mul_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_mul_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } @@ -2165,14 +2165,14 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ (__v16sf)_mm512_setzero_ps())) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_div_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static 
__inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_div_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -2195,14 +2195,14 @@ _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_div_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_div_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index 2776450387480..4c6bf3a3fa968 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -588,23 +588,20 @@ _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) { (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_add_sh(__m128h __A, __m128h __B) { __A[0] += __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_add_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_add_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); } @@ -624,23 +621,20 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h + __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sub_sh(__m128h __A, __m128h __B) { __A[0] -= __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_sub_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_sub_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); } @@ -660,23 +654,20 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), 
(__v8hf)_mm_setzero_ph(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h + __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mul_sh(__m128h __A, __m128h __B) { __A[0] *= __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_mul_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_mul_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); } @@ -696,23 +687,20 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h + __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_div_sh(__m128h __A, __m128h __B) { __A[0] /= __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_div_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_div_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); } @@ -960,22 +948,19 @@ static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P, } // moves with vmovsh: -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a, - __m128h __b) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_move_sh(__m128h __a, __m128h __b) { __a[0] = __b[0]; return __a; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) { return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), _mm_setzero_ph()); } diff --git a/clang/test/CodeGen/X86/avx10_2bf16-builtins.c b/clang/test/CodeGen/X86/avx10_2bf16-builtins.c index f8a4c51d9ceb3..fac7ef2e2bf29 100644 --- a/clang/test/CodeGen/X86/avx10_2bf16-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2bf16-builtins.c @@ -1,7 +1,11 @@ // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386 
-target-feature +avx10.2 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386 -target-feature +avx10.2 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s + +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386 -target-feature +avx10.2 -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s #include <immintrin.h> +#include "builtin_test_helpers.h" __m256bh test_mm256_setzero_pbh() { // CHECK-LABEL: @test_mm256_setzero_pbh @@ -353,6 +357,7 @@ __m128bh test_mm_move_sbh(__m128bh A, __m128bh B) { // CHECK: insertelement <8 x bfloat> %{{.*}}, bfloat %{{.*}}, i32 0 return _mm_move_sbh(A, B); } +TEST_CONSTEXPR(match_m128bh(_mm_move_sbh((__m128bh)(__v8bf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f}, (__m128bh)(__v8bf){9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f,16.0f}), 9.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); __m128bh test_mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { // CHECK-LABEL: @test_mm_mask_move_sbh @@ -366,6 +371,7 @@ __m128bh test_mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128b // CHECK-NEXT: insertelement <8 x bfloat> [[VEC]], bfloat [[SEL]], i64 0 return _mm_mask_move_sbh(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128bh(_mm_mask_move_sbh((__m128bh)(__v8bf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f}, (__mmask8)0x01, (__m128bh)(__v8bf){100.0f,200.0f,300.0f,400.0f,500.0f,600.0f,700.0f,800.0f}, (__m128bh)(__v8bf){9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f,16.0f}), 9.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); __m128bh test_mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) { // CHECK-LABEL: @test_mm_maskz_move_sbh diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index ec813e5acd7cf..7e62a7d92890f 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -3302,6 +3302,8 @@ __m128 test_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_mask_add_ss(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_add_ss((__m128)(__v4sf){10.0f, 100.0f, 200.0f, 300.0f}, 0x1,(__m128)(__v4sf){1.25f, 3.0f, 4.0f, 5.0f},(__m128)(__v4sf){2.75f, 6.0f, 7.0f, 8.0f}),4.0f, 100.0f, 200.0f, 300.0f)); + __m128 test_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_add_ss // CHECK-NOT: @llvm.x86.avx512.mask.add.ss.round // CHECK: extractelement <4 x float> %{{.*}}, i32 0 // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i32 0 // CHECK-NEXT: fadd float %{{.*}}, %{{.*}} // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i32 0 // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}} // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_maskz_add_ss(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_add_ss(0x1, (__m128)(__v4sf){1.25f, 3.0f, 4.0f, 5.0f}, (__m128)(__v4sf){2.75f, 6.0f, 7.0f, 8.0f}), 4.0f, 0.0f, 0.0f, 0.0f)); + __m128d test_mm_add_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_add_round_sd // CHECK: @llvm.x86.avx512.mask.add.sd.round @@ -3347,6 +3351,8 @@ __m128d test_mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return 
_mm_mask_add_sd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_add_sd((__m128d)(__v2df){10.0, 999.0}, 0x1, (__m128d)(__v2df){5.5, 77.0}, (__m128d)(__v2df){0.25, 88.0}), 5.75, 999.0)); + __m128d test_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_add_sd // CHECK-NOT: @llvm.x86.avx512.mask.add.sd.round @@ -3362,6 +3368,8 @@ __m128d test_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_maskz_add_sd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_add_sd(0x1, (__m128d)(__v2df){5.5, 77.0}, (__m128d)(__v2df){0.25, 88.0}), 5.75, 0.0)); + __m512d test_mm512_sub_round_pd(__m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_sub_round_pd // CHECK: @llvm.x86.avx512.sub.pd.512 @@ -3450,6 +3458,8 @@ __m128 test_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_mask_sub_ss(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_sub_ss((__m128)(__v4sf){-1.0f, 10.0f, 20.0f, 30.0f}, 0x1, (__m128)(__v4sf){7.0f, 3.0f, 4.0f, 5.0f}, (__m128)(__v4sf){2.5f, 6.0f, 7.0f, 8.0f}), 4.5f, 10.0f, 20.0f, 30.0f)); + __m128 test_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_sub_ss // CHECK-NOT: @llvm.x86.avx512.mask.sub.ss.round @@ -3465,6 +3475,8 @@ __m128 test_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_maskz_sub_ss(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_sub_ss(0x1, (__m128)(__v4sf){7.0f, 3.0f, 4.0f, 5.0f}, (__m128)(__v4sf){2.5f, 6.0f, 7.0f, 8.0f}), 4.5f, 0.0f, 0.0f, 0.0f)); + __m128d test_mm_sub_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_sub_round_sd // CHECK: @llvm.x86.avx512.mask.sub.sd.round @@ -3495,6 +3507,8 @@ __m128d test_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_mask_sub_sd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_sub_sd((__m128d)(__v2df){-1.0, 111.0}, 0x1, (__m128d)(__v2df){9.0, 70.0}, (__m128d)(__v2df){3.5, 80.0}), 5.5, 111.0)); + __m128d test_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_sub_sd // CHECK-NOT: @llvm.x86.avx512.mask.sub.sd.round @@ -3510,6 +3524,8 @@ __m128d test_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_maskz_sub_sd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_sub_sd(0x1, (__m128d)(__v2df){9.0, 70.0}, (__m128d)(__v2df){3.5, 80.0}), 5.5, 0.0)); + __m512d test_mm512_mul_round_pd(__m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_mul_round_pd // CHECK: @llvm.x86.avx512.mul.pd.512 @@ -3598,6 +3614,8 @@ __m128 test_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_mask_mul_ss(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_mul_ss((__m128)(__v4sf){42.0f, -1.0f, -2.0f, -3.0f}, 0x1, (__m128)(__v4sf){6.0f, 9.0f, 9.0f, 9.0f}, (__m128)(__v4sf){7.0f, 8.0f, 8.0f, 8.0f}), 42.0f, -1.0f, -2.0f, -3.0f)); + __m128 test_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_mul_ss // CHECK-NOT: @llvm.x86.avx512.mask.mul.ss.round @@ -3613,6 +3631,8 @@ __m128 test_mm_maskz_mul_ss(__mmask8 __U, 
__m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_maskz_mul_ss(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_mul_ss(0x1, (__m128)(__v4sf){6.0f, 9.0f, 9.0f, 9.0f}, (__m128)(__v4sf){7.0f, 8.0f, 8.0f, 8.0f}), 42.0f, 0.0f, 0.0f, 0.0f)); + __m128d test_mm_mul_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_mul_round_sd // CHECK: @llvm.x86.avx512.mask.mul.sd.round @@ -3643,6 +3663,8 @@ __m128d test_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_mask_mul_sd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_mul_sd((__m128d)(__v2df){123.0, -9.0}, 0x1, (__m128d)(__v2df){2.5, 1.0}, (__m128d)(__v2df){4.0, 2.0}), 10.0, -9.0)); + __m128d test_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_mul_sd // CHECK-NOT: @llvm.x86.avx512.mask.mul.sd.round @@ -3658,6 +3680,8 @@ __m128d test_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_maskz_mul_sd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_mul_sd(0x1, (__m128d)(__v2df){2.5, 1.0}, (__m128d)(__v2df){4.0, 2.0}), 10.0, 0.0)); + __m512d test_mm512_div_round_pd(__m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_div_round_pd // CHECK: @llvm.x86.avx512.div.pd.512 @@ -3757,6 +3781,8 @@ __m128 test_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_mask_div_ss(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_div_ss((__m128)(__v4sf){-7.0f, 5.0f, 6.0f, 7.0f}, 0x1, (__m128)(__v4sf){9.0f, 1.0f, 1.0f, 1.0f}, (__m128)(__v4sf){3.0f, 2.0f, 2.0f, 2.0f}), 3.0f, 5.0f, 6.0f, 7.0f)); + __m128 test_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_div_ss // CHECK: extractelement <4 x float> %{{.*}}, i32 0 @@ -3771,6 +3797,8 @@ __m128 test_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 return _mm_maskz_div_ss(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_div_ss(0x1, (__m128)(__v4sf){9.0f, 1.0f, 1.0f, 1.0f}, (__m128)(__v4sf){3.0f, 2.0f, 2.0f, 2.0f}), 3.0f, 0.0f, 0.0f, 0.0f)); + __m128d test_mm_div_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_div_round_sd // CHECK: @llvm.x86.avx512.mask.div.sd.round @@ -3800,6 +3828,8 @@ __m128d test_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_mask_div_sd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_div_sd((__m128d)(__v2df){-8.0, 44.0}, 0x1, (__m128d)(__v2df){8.0, 10.0}, (__m128d)(__v2df){2.0, 20.0}), 4.0, 44.0)); + __m128d test_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_div_sd // CHECK: extractelement <2 x double> %{{.*}}, i32 0 @@ -3814,6 +3844,8 @@ __m128d test_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 return _mm_maskz_div_sd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_div_sd(0x1, (__m128d)(__v2df){8.0, 10.0}, (__m128d)(__v2df){2.0, 20.0}), 4.0, 0.0)); + __m128 test_mm_max_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_max_round_ss // CHECK: @llvm.x86.avx512.mask.max.ss.round @@ -11673,6 +11705,7 @@ __m128 
test_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) // CHECK-NEXT: insertelement <4 x float> [[VEC]], float [[SEL]], i64 0 return _mm_mask_move_ss ( __W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128(_mm_mask_move_ss((__m128)(__v4sf){1.0f,2.0f,3.0f,4.0f}, (__mmask8)0x01, (__m128)(__v4sf){100.0f,200.0f,300.0f,400.0f}, (__m128)(__v4sf){9.0f,10.0f,11.0f,12.0f}), 9.0f,2.0f,3.0f,4.0f)); __m128 test_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) { @@ -11687,6 +11720,7 @@ __m128 test_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) // CHECK-NEXT: insertelement <4 x float> [[VEC]], float [[SEL]], i64 0 return _mm_maskz_move_ss (__U, __A, __B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_move_ss((__mmask8)0x01, (__m128)(__v4sf){0.0f,0.0f,0.0f,0.0f}, (__m128)(__v4sf){9.0f,10.0f,11.0f,12.0f}), 9.0f,0.0f,0.0f,0.0f)); __m128d test_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { @@ -11701,6 +11735,7 @@ __m128d test_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __ // CHECK-NEXT: insertelement <2 x double> [[VEC]], double [[SEL]], i64 0 return _mm_mask_move_sd ( __W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_move_sd((__m128d)(__v2df){1.0,2.0}, (__mmask8)0x01, (__m128d)(__v2df){100.0,200.0}, (__m128d)(__v2df){9.0,10.0}), 9.0,2.0)); __m128d test_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) { @@ -11715,6 +11750,7 @@ __m128d test_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) // CHECK-NEXT: insertelement <2 x double> [[VEC]], double [[SEL]], i64 0 return _mm_maskz_move_sd (__U, __A, __B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_move_sd((__mmask8)0x01, (__m128d)(__v2df){0.0,0.0}, (__m128d)(__v2df){9.0,10.0}), 9.0,0.0)); void test_mm_mask_store_ss(float * __P, __mmask8 __U, __m128 __A) { diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c index f0a0a3b28542f..1c8ab8ca52099 100644 --- a/clang/test/CodeGen/X86/avx512fp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -796,6 +796,8 @@ __m128h test_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_mask_add_sh(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_mask_add_sh((__m128h)(__v8hf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f},(__mmask8)0x01,(__m128h)(__v8hf){10.0f,20.0f,30.0f,40.0f,50.0f,60.0f,70.0f,80.0f},(__m128h)(__v8hf){100.0f,200.0f,300.0f,400.0f,500.0f,600.0f,700.0f,800.0f}),110.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); + __m128h test_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_maskz_add_sh // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 @@ -810,6 +812,7 @@ __m128h test_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_maskz_add_sh(__U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_maskz_add_sh((__mmask8)0x01,(__m128h)(__v8hf){10.0f,20.0f,30.0f,40.0f,50.0f,60.0f,70.0f,80.0f},(__m128h)(__v8hf){100.0f,200.0f,300.0f,400.0f,500.0f,600.0f,700.0f,800.0f}),110.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f)); __m128h test_mm_add_sh(__m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_add_sh @@ -849,6 +852,8 @@ __m128h test_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_mask_sub_sh(__W, __U, __A, __B); } 
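+// Lane 0 takes the masked difference 20.0 - 5.0 = 15.0 (mask bit 0 is set);
+// the unselected lanes 1..7 keep __W's values 2.0f..8.0f.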
+TEST_CONSTEXPR(match_m128h(_mm_mask_sub_sh((__m128h)(__v8hf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f},(__mmask8)0x01,(__m128h)(__v8hf){20.0f,21.0f,22.0f,23.0f,24.0f,25.0f,26.0f,27.0f},(__m128h)(__v8hf){5.0f,6.0f,7.0f,8.0f,9.0f,10.0f,11.0f,12.0f}),15.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); + __m128h test_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_maskz_sub_sh // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 @@ -863,6 +868,7 @@ __m128h test_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_maskz_sub_sh(__U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_maskz_sub_sh((__mmask8)0x01,(__m128h)(__v8hf){20.0f,21.0f,22.0f,23.0f,24.0f,25.0f,26.0f,27.0f},(__m128h)(__v8hf){5.0f,6.0f,7.0f,8.0f,9.0f,10.0f,11.0f,12.0f}),15.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f)); __m128h test_mm_sub_sh(__m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_sub_sh @@ -902,6 +908,8 @@ __m128h test_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_mask_mul_sh(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_mask_mul_sh((__m128h)(__v8hf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f},(__mmask8)0x01,(__m128h)(__v8hf){3.0f,4.0f,5.0f,6.0f,7.0f,8.0f,9.0f,10.0f},(__m128h)(__v8hf){4.0f,5.0f,6.0f,7.0f,8.0f,9.0f,10.0f,11.0f}),12.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); + __m128h test_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_maskz_mul_sh // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 @@ -916,6 +924,7 @@ __m128h test_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_maskz_mul_sh(__U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_maskz_mul_sh((__mmask8)0x01,(__m128h)(__v8hf){3.0f,4.0f,5.0f,6.0f,7.0f,8.0f,9.0f,10.0f},(__m128h)(__v8hf){4.0f,5.0f,6.0f,7.0f,8.0f,9.0f,10.0f,11.0f}),12.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f)); __m128h test_mm_mul_sh(__m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_mul_sh @@ -955,6 +964,8 @@ __m128h test_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_mask_div_sh(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_mask_div_sh((__m128h)(__v8hf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f},(__mmask8)0x01,(__m128h)(__v8hf){8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f},(__m128h)(__v8hf){4.0f,3.0f,2.0f,1.0f,2.0f,3.0f,4.0f,5.0f}),2.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); + __m128h test_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_maskz_div_sh // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 @@ -969,6 +980,7 @@ __m128h test_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 return _mm_maskz_div_sh(__U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_maskz_div_sh((__mmask8)0x01,(__m128h)(__v8hf){8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f},(__m128h)(__v8hf){4.0f,3.0f,2.0f,1.0f,2.0f,3.0f,4.0f,5.0f}),2.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f)); __m128h test_mm_div_sh(__m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_div_sh @@ -1622,6 +1634,7 @@ __m128h test_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B // CHECK-NEXT: insertelement <8 x half> [[VEC]], half 
[[SEL]], i64 0 return _mm_mask_move_sh(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_mask_move_sh((__m128h)(__v8hf){1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f}, (__mmask8)0x01, (__m128h)(__v8hf){10.0f,20.0f,30.0f,40.0f,50.0f,60.0f,70.0f,80.0f}, (__m128h)(__v8hf){100.0f,200.0f,300.0f,400.0f,500.0f,600.0f,700.0f,800.0f}), 100.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f)); __m128h test_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-LABEL: test_mm_maskz_move_sh @@ -1635,6 +1648,7 @@ __m128h test_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) { // CHECK-NEXT: insertelement <8 x half> [[VEC]], half [[SEL]], i64 0 return _mm_maskz_move_sh(__U, __A, __B); } +TEST_CONSTEXPR(match_m128h(_mm_maskz_move_sh((__mmask8)0x01, (__m128h)(__v8hf){10.0f,20.0f,30.0f,40.0f,50.0f,60.0f,70.0f,80.0f}, (__m128h)(__v8hf){100.0f,200.0f,300.0f,400.0f,500.0f,600.0f,700.0f,800.0f}), 100.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f)); short test_mm_cvtsi128_si16(__m128i A) { // CHECK-LABEL: test_mm_cvtsi128_si16 From 2ea1a09244bc870499b316e8576c526a6e04b644 Mon Sep 17 00:00:00 2001 From: woruyu <1214539920@qq.com> Date: Tue, 18 Nov 2025 18:32:03 +0800 Subject: [PATCH 21/35] [Headers][X86] Allow AVX512 masked arithmetic pd/ps/epi/epu intrinsics to be used in constexpr (#168496) ### Summary This PR resolves #160559: the remaining pd/ps/epi/epu part of the AVX512 masked arithmetic intrinsics. --- clang/lib/Headers/avx512fintrin.h | 52 +++++------ clang/lib/Headers/avx512vlintrin.h | 104 ++++++++++----------- clang/test/CodeGen/X86/avx512f-builtins.c | 36 +++++++ clang/test/CodeGen/X86/avx512vl-builtins.c | 73 ++++++++++++++- 4 files changed, 180 insertions(+), 85 deletions(-) diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 531c23a210586..e4184795e47e9 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -1369,17 +1369,15 @@ _mm512_mul_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epi32(__X, __Y), (__v8di)__W); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epi32(__X, __Y), (__v8di)_mm512_setzero_si512 ()); @@ -1390,17 +1388,15 @@ _mm512_mul_epu32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epu32(__X, __Y), (__v8di)__W); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) { return
(__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epu32(__X, __Y), (__v8di)_mm512_setzero_si512 ()); @@ -1879,28 +1875,28 @@ _mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_add_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_add_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_add_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_add_ps(__A, __B), @@ -1994,28 +1990,28 @@ _mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_sub_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_sub_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_sub_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_sub_ps(__A, __B), @@ -2109,28 +2105,28 @@ _mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_mul_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_mul_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 
__B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_mul_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_mul_ps(__A, __B), @@ -2230,14 +2226,14 @@ static __inline __m512d return (__m512d)((__v8df)__a/(__v8df)__b); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_div_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_div_pd(__A, __B), @@ -2249,14 +2245,14 @@ _mm512_div_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a/(__v16sf)__b); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_div_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_div_ps(__A, __B), diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 1e6e42df6b5fb..5a1b540e07e3a 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -347,65 +347,57 @@ _mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) { (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epi32(__X, __Y), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epi32(__X, __Y), (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epi32(__X, __Y), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epi32(__X, __Y), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mul_epu32(__m256i 
__W, __mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epu32(__X, __Y), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epu32(__X, __Y), (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epu32(__X, __Y), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epu32(__X, __Y), (__v2di)_mm_setzero_si128()); @@ -1426,56 +1418,56 @@ _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) (__v8sf) __C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_add_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_add_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_add_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_add_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_add_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_add_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_add_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static 
__inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_add_ps(__A, __B), @@ -2202,56 +2194,56 @@ _mm256_maskz_cvtepu32_ps(__mmask8 __U, __m256i __A) { (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_div_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_div_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_div_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_div_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_div_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_div_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_div_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_div_ps(__A, __B), @@ -2717,56 +2709,56 @@ _mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_mul_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_mul_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_mul_pd(__A, __B), (__v4df)__W); } -static 
__inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_mul_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_mul_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_mul_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_mul_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_mul_ps(__A, __B), @@ -3500,56 +3492,56 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_sub_pd(__A, __B), (__v2df)__W); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_sub_pd(__A, __B), (__v2df)_mm_setzero_pd()); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_sub_pd(__A, __B), (__v4df)__W); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_sub_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_sub_ps(__A, __B), (__v4sf)__W); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_sub_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_sub_ps(__A, __B), (__v8sf)__W); } - static __inline__ __m256 
__DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_sub_ps(__A, __B), diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 7e62a7d92890f..eb25aa538e9a3 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -3137,6 +3137,7 @@ __m512i test_mm512_maskz_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B) { //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_mul_epi32(__k,__A,__B); } +TEST_CONSTEXPR(match_m512i(_mm512_maskz_mul_epi32((__mmask8)0b11110000, (__m512i){1, 2, 3, 4, 5, 6, 7, 8}, (__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 0, 0, 0, 0, 250, 360, 490, 640)); __m512i test_mm512_mask_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) { //CHECK-LABEL: test_mm512_mask_mul_epi32 @@ -3148,6 +3149,7 @@ __m512i test_mm512_mask_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B, __m512 //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_mul_epi32(__src,__k,__A,__B); } +TEST_CONSTEXPR(match_m512i(_mm512_mask_mul_epi32((__m512i){1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000}, (__mmask8)0b11110000, (__m512i){1, 2, 3, 4, 5, 6, 7, 8}, (__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 1000, 2000, 3000, 4000, 250, 360, 490, 640)); __m512i test_mm512_mul_epu32 (__m512i __A, __m512i __B) { //CHECK-LABEL: test_mm512_mul_epu32 @@ -3166,6 +3168,7 @@ __m512i test_mm512_maskz_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B) { //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_mul_epu32(__k,__A,__B); } +TEST_CONSTEXPR(match_m512i(_mm512_maskz_mul_epu32((__mmask8)0b11110000, (__m512i){1, 2, 3, 4, 5, 6, 7, 8}, (__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 0, 0, 0, 0, 250, 360, 490, 640)); __m512i test_mm512_mask_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) { //CHECK-LABEL: test_mm512_mask_mul_epu32 @@ -3175,6 +3178,7 @@ __m512i test_mm512_mask_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B, __m512 //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_mul_epu32(__src,__k,__A,__B); } +TEST_CONSTEXPR(match_m512i(_mm512_mask_mul_epu32((__m512i){1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000}, (__mmask8)0b11110000, (__m512i){1, 2, 3, 4, 5, 6, 7, 8}, (__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 1000, 2000, 3000, 4000, 250, 360, 490, 640)); __m512i test_mm512_maskz_mullo_epi32 (__mmask16 __k,__m512i __A, __m512i __B) { //CHECK-LABEL: test_mm512_maskz_mullo_epi32 @@ -3237,12 +3241,16 @@ __m512d test_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d _ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_add_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_add_pd((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__mmask8)0b11110000, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}, (__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}), 1.0, 2.0, 3.0, 4.0, 550.0, 660.0, 770.0, 880.0)); + __m512d test_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_maskz_add_pd // CHECK: fadd <8 x double> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_add_pd(__U,__A,__B); } 
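+// Zero-masking: mask 0b11110000 leaves lanes 0..3 zeroed and stores the sums
+// 550.0..880.0 (e.g. 50.0 + 500.0) in lanes 4..7.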
+TEST_CONSTEXPR(match_m512d(_mm512_maskz_add_pd((__mmask8)0b11110000, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}, (__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}), 0.0, 0.0, 0.0, 0.0, 550.0, 660.0, 770.0, 880.0)); + __m512 test_mm512_add_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_add_round_ps // CHECK: @llvm.x86.avx512.add.ps.512 @@ -3266,12 +3274,16 @@ __m512 test_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_add_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_mask_add_ps((__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__mmask16)0b1111111100000000, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}, (__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 1600.0f}), 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 990.0f, 1100.0f, 1210.0f, 1320.0f, 1430.0f, 1540.0f, 1650.0f, 1760.0f)); + __m512 test_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_maskz_add_ps // CHECK: fadd <16 x float> %{{.*}}, %{{.*}} // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_add_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_add_ps((__mmask16)0b1111111100000000, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}, (__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 1600.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 990.0f, 1100.0f, 1210.0f, 1320.0f, 1430.0f, 1540.0f, 1650.0f, 1760.0f)); + __m128 test_mm_add_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_add_round_ss // CHECK: @llvm.x86.avx512.mask.add.ss.round @@ -3393,12 +3405,16 @@ __m512d test_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d _ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_sub_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_sub_pd((__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}, (__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 100.0, 200.0, 300.0, 400.0, -45.0, -54.0, -63.0, -72.0)); + __m512d test_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_maskz_sub_pd // CHECK: fsub <8 x double> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_sub_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_sub_pd((__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0.0, 0.0, 0.0, 0.0, -45.0, -54.0, -63.0, -72.0)); + __m512 test_mm512_sub_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_sub_round_ps // CHECK: @llvm.x86.avx512.sub.ps.512 @@ -3422,12 +3438,16 @@ __m512 test_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_sub_ps(__W,__U,__A,__B); } 
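+// Merge-masking: the low eight lanes keep __W (100.0f..800.0f); the high eight
+// receive the differences, e.g. 9.0f - 90.0f = -81.0f.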
+TEST_CONSTEXPR(match_m512(_mm512_mask_sub_ps((__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 1600.0f}, (__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, -81.0f, -90.0f, -99.0f, -108.0f, -117.0f, -126.0f, -135.0f, -144.0f)); + __m512 test_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_maskz_sub_ps // CHECK: fsub <16 x float> %{{.*}}, %{{.*}} // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_sub_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_sub_ps((__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -81.0f, -90.0f, -99.0f, -108.0f, -117.0f, -126.0f, -135.0f, -144.0f)); + __m128 test_mm_sub_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_sub_round_ss // CHECK: @llvm.x86.avx512.mask.sub.ss.round @@ -3549,12 +3569,16 @@ __m512d test_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d _ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_mul_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_mul_pd((__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}, (__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 100.0, 200.0, 300.0, 400.0, 250.0, 360.0, 490.0, 640.0)); + __m512d test_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_maskz_mul_pd // CHECK: fmul <8 x double> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_mul_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_mul_pd((__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0.0, 0.0, 0.0, 0.0, 250.0, 360.0, 490.0, 640.0)); + __m512 test_mm512_mul_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_mul_round_ps // CHECK: @llvm.x86.avx512.mul.ps.512 @@ -3578,12 +3602,16 @@ __m512 test_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_mul_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_mask_mul_ps((__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 1600.0f}, (__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 810.0f, 1000.0f, 1210.0f, 1440.0f, 1690.0f, 1960.0f, 2250.0f, 2560.0f)); + __m512 test_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { // 
CHECK-LABEL: test_mm512_maskz_mul_ps // CHECK: fmul <16 x float> %{{.*}}, %{{.*}} // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_mul_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_mul_ps((__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 810.0f, 1000.0f, 1210.0f, 1440.0f, 1690.0f, 1960.0f, 2250.0f, 2560.0f)); + __m128 test_mm_mul_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_mul_round_ss // CHECK: @llvm.x86.avx512.mask.mul.ss.round @@ -3711,12 +3739,16 @@ __m512d test_mm512_mask_div_pd(__m512d __w, __mmask8 __u, __m512d __a, __m512d _ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_div_pd(__w,__u,__a,__b); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_div_pd((__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}, (__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 100.0, 200.0, 300.0, 400.0, 0.1, 0.1, 0.1, 0.1)); + __m512d test_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_maskz_div_pd // CHECK: fdiv <8 x double> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_div_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_div_pd((__mmask8)0b11110000, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, (__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.1)); + __m512 test_mm512_div_round_ps(__m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_div_round_ps // CHECK: @llvm.x86.avx512.div.ps.512 @@ -3746,12 +3778,16 @@ __m512 test_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_div_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_mask_div_ps((__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 1600.0f}, (__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f)); + __m512 test_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_maskz_div_ps // CHECK: fdiv <16 x float> %{{.*}}, %{{.*}} // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_div_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_div_ps((__mmask16)0b1111111100000000, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, (__m512){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f, 110.0f, 120.0f, 130.0f, 140.0f, 150.0f, 160.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f)); + __m128 test_mm_div_round_ss(__m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_div_round_ss 
// CHECK: @llvm.x86.avx512.mask.div.ss.round diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index a7eee79c97539..e05b1ddf7b69a 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -770,6 +770,7 @@ __m256i test_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X, //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_mul_epi32(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_m256i(_mm256_mask_mul_epi32((__m256i){100,200,300,400}, (__mmask8)0b00001100, (__m256i){1,2,3,4}, (__m256i){10,20,30,40}), 100,200,90,160)); __m256i test_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) { //CHECK-LABEL: test_mm256_maskz_mul_epi32 @@ -781,7 +782,7 @@ __m256i test_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) { //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_mul_epi32(__M, __X, __Y); } - +TEST_CONSTEXPR(match_m256i(_mm256_maskz_mul_epi32((__mmask8)0b00001100, (__m256i){1,2,3,4}, (__m256i){10,20,30,40}), 0,0,90,160)); __m128i test_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { @@ -794,6 +795,7 @@ __m128i test_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_mul_epi32(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_m128i(_mm_mask_mul_epi32((__m128i){100,200}, (__mmask8)0b00000001, (__m128i){1,2}, (__m128i){10,20}), 10,200)); __m128i test_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) { //CHECK-LABEL: test_mm_maskz_mul_epi32 @@ -805,6 +807,7 @@ __m128i test_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) { //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_mul_epi32(__M, __X, __Y); } +TEST_CONSTEXPR(match_m128i(_mm_maskz_mul_epi32((__mmask8)0b00000010, (__m128i){1,2}, (__m128i){10,20}), 0,40)); __m256i test_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { @@ -815,6 +818,7 @@ __m256i test_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_mul_epu32(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_m256i(_mm256_mask_mul_epu32((__m256i){100,200,300,400}, (__mmask8)0b00001100, (__m256i){1,2,3,4}, (__m256i){10,20,30,40}), 100,200,90,160)); __m256i test_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) { //CHECK-LABEL: test_mm256_maskz_mul_epu32 @@ -824,6 +828,7 @@ __m256i test_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) { //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_mul_epu32(__M, __X, __Y); } +TEST_CONSTEXPR(match_m256i(_mm256_maskz_mul_epu32((__mmask8)0b00001100, (__m256i){1,2,3,4}, (__m256i){10,20,30,40}), 0,0,90,160)); __m128i test_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { @@ -834,6 +839,7 @@ __m128i test_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_mul_epu32(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_m128i(_mm_mask_mul_epu32((__m128i){100,200}, (__mmask8)0b00000001, (__m128i){1,2}, (__m128i){10,20}), 10,200)); __m128i test_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) { //CHECK-LABEL: test_mm_maskz_mul_epu32 @@ -843,6 +849,7 @@ __m128i test_mm_maskz_mul_epu32 
(__mmask8 __M, __m128i __X, __m128i __Y) { //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_mul_epu32(__M, __X, __Y); } +TEST_CONSTEXPR(match_m128i(_mm_maskz_mul_epu32((__mmask8)0b00000010, (__m128i){1,2}, (__m128i){10,20}), 0,40)); __m128i test_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B) { //CHECK-LABEL: test_mm_maskz_mullo_epi32 @@ -3606,48 +3613,64 @@ __m128d test_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_add_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_add_pd((__m128d){1.0, 2.0}, (__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 110.0, 2.0)); + __m128d test_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_add_pd // CHECK: fadd <2 x double> %{{.*}}, %{{.*}} // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_add_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_add_pd((__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 110.0, 0.0)); + __m256d test_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_mask_add_pd // CHECK: fadd <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_add_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_add_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 1.0, 2.0, 330.0, 440.0)); + __m256d test_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_add_pd // CHECK: fadd <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_add_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_add_pd((__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 0.0, 0.0, 330.0, 440.0)); + __m128 test_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_mask_add_ps // CHECK: fadd <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_add_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_add_ps((__m128){1.0f, 2.0f, 3.0f, 4.0f}, (__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 1.0f, 220.0f, 3.0f, 440.0f)); + __m128 test_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_add_ps // CHECK: fadd <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_add_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_add_ps((__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 0.0f, 220.0f, 0.0f, 440.0f)); + __m256 test_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_mask_add_ps // CHECK: fadd <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_add_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_mask_add_ps((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, (__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 
200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 1.0f, 2.0f, 3.0f, 4.0f, 550.0f, 660.0f, 770.0f, 880.0f)); + __m256 test_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_add_ps // CHECK: fadd <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_add_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_add_ps((__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 550.0f, 660.0f, 770.0f, 880.0f)); + __m128i test_mm_mask_blend_epi32(__mmask8 __U, __m128i __A, __m128i __W) { // CHECK-LABEL: test_mm_mask_blend_epi32 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} @@ -4352,48 +4375,64 @@ __m128d test_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_div_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_div_pd((__m128d){1.0, 2.0}, (__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 0.1, 2.0)); + __m128d test_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_div_pd // CHECK: fdiv <2 x double> %{{.*}}, %{{.*}} // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_div_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_div_pd((__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 0.1, 0.0)); + __m256d test_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_mask_div_pd // CHECK: fdiv <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_div_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_div_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 1.0, 2.0, 0.1, 0.1)); + __m256d test_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_div_pd // CHECK: fdiv <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_div_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_div_pd((__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 0.0, 0.0, 0.1, 0.1)); + __m128 test_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_mask_div_ps // CHECK: fdiv <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_div_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_div_ps((__m128){1.0f, 2.0f, 3.0f, 4.0f}, (__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 1.0f, 0.1f, 3.0f, 0.1f)); + __m128 test_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_div_ps // CHECK: fdiv <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_div_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_div_ps((__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 0.0f, 0.1f, 0.0f, 0.1f)); + __m256 test_mm256_mask_div_ps(__m256 __W, __mmask8 __U, 
__m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_mask_div_ps // CHECK: fdiv <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_div_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_mask_div_ps((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, (__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.1f, 0.1f, 0.1f)); + __m256 test_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_div_ps // CHECK: fdiv <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_div_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_div_ps((__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 0.1f, 0.1f, 0.1f, 0.1f)); + __m128d test_mm_mask_expand_pd(__m128d __W, __mmask8 __U, __m128d __A) { // CHECK-LABEL: test_mm_mask_expand_pd // CHECK: @llvm.x86.avx512.mask.expand @@ -4716,48 +4755,64 @@ __m128d test_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_mul_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_mul_pd((__m128d){1.0, 2.0}, (__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 1000.0, 2.0)); + __m128d test_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_mul_pd // CHECK: fmul <2 x double> %{{.*}}, %{{.*}} // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_mul_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_mul_pd((__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), 1000.0, 0.0)); + __m256d test_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_mask_mul_pd // CHECK: fmul <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_mul_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_mul_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 1.0, 2.0, 9000.0, 16000.0)); + __m256d test_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_mul_pd // CHECK: fmul <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_mul_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_mul_pd((__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 0.0, 0.0, 9000.0, 16000.0)); + __m128 test_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_mask_mul_ps // CHECK: fmul <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_mul_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_mul_ps((__m128){1.0f, 2.0f, 3.0f, 4.0f}, (__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 1.0f, 4000.0f, 3.0f, 16000.0f)); + __m128 test_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: 
test_mm_maskz_mul_ps // CHECK: fmul <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_mul_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_mul_ps((__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 0.0f, 4000.0f, 0.0f, 16000.0f)); + __m256 test_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_mask_mul_ps // CHECK: fmul <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_mul_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_mask_mul_ps((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, (__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 1.0f, 2.0f, 3.0f, 4.0f, 25000.0f, 36000.0f, 49000.0f, 64000.0f)); + __m256 test_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_mul_ps // CHECK: fmul <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_mul_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_mul_ps((__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 0.0f, 0.0f, 0.0f, 0.0f, 25000.0f, 36000.0f, 49000.0f, 64000.0f)); + __m128i test_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_abs_epi32 // CHECK: [[ABS:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %{{.*}}, i1 false) @@ -5562,48 +5617,64 @@ __m128d test_mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_sub_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_sub_pd((__m128d){1.0, 2.0}, (__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), -90.0, 2.0)); + __m128d test_mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_sub_pd // CHECK: fsub <2 x double> %{{.*}}, %{{.*}} // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_sub_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_sub_pd((__mmask8)0b00000001, (__m128d){10.0, 20.0}, (__m128d){100.0, 200.0}), -90.0, 0.0)); + __m256d test_mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_mask_sub_pd // CHECK: fsub <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_sub_pd(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_sub_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 1.0, 2.0, -270.0, -360.0)); + __m256d test_mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_sub_pd // CHECK: fsub <4 x double> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_sub_pd(__U,__A,__B); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_sub_pd((__mmask8)0b00001100, (__m256d){10.0, 20.0, 30.0, 40.0}, (__m256d){100.0, 200.0, 300.0, 400.0}), 0.0, 0.0, -270.0, -360.0)); + __m128 test_mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // 
CHECK-LABEL: test_mm_mask_sub_ps // CHECK: fsub <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_sub_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_mask_sub_ps((__m128){1.0f, 2.0f, 3.0f, 4.0f}, (__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 1.0f, -180.0f, 3.0f, -360.0f)); + __m128 test_mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_maskz_sub_ps // CHECK: fsub <4 x float> %{{.*}}, %{{.*}} // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_sub_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m128(_mm_maskz_sub_ps((__mmask8)0b00001010, (__m128){10.0f, 20.0f, 30.0f, 40.0f}, (__m128){100.0f, 200.0f, 300.0f, 400.0f}), 0.0f, -180.0f, 0.0f, -360.0f)); + __m256 test_mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_mask_sub_ps // CHECK: fsub <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_sub_ps(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_mask_sub_ps((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, (__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 1.0f, 2.0f, 3.0f, 4.0f, -450.0f, -540.0f, -630.0f, -720.0f)); + __m256 test_mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_sub_ps // CHECK: fsub <8 x float> %{{.*}}, %{{.*}} // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_sub_ps(__U,__A,__B); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_sub_ps((__mmask8)0b11110000, (__m256){10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f}, (__m256){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f}), 0.0f, 0.0f, 0.0f, 0.0f, -450.0f, -540.0f, -630.0f, -720.0f)); + __m128i test_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { // CHECK-LABEL: test_mm_mask2_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 From 672757bf556eecff11f8e6af64e8298023d4722f Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 18 Nov 2025 02:41:16 -0800 Subject: [PATCH 22/35] [WebAssembly] Add patterns for extadd pairwise (#167960) Add a few patterns for extadd pairwise. 
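The IR shape these patterns target is a pairwise widening add: deinterleave a vector into its even and odd lanes, extend both halves, then add. As a rough sketch (not part of this patch, and dependent on vectorizer behavior), C++ source like the following typically lowers to that shuffle/extend/add sequence when built for wasm32 with SIMD enabled; the function name here is illustrative:

```cpp
// Sketch: pairwise widening add over bytes. Compiled with
// clang++ -O2 --target=wasm32 -msimd128, the loop vectorizer tends to
// emit the even/odd shufflevector + sext + add pattern that the new
// TableGen patterns fold into i16x8.extadd_pairwise_i8x16_s.
void pairwise_add(const signed char *a, short *out, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = static_cast<short>(a[2 * i]) + static_cast<short>(a[2 * i + 1]);
}
```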
--- .../WebAssembly/WebAssemblyInstrSIMD.td | 26 ++++++ llvm/test/CodeGen/WebAssembly/simd-extadd.ll | 89 +++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 llvm/test/CodeGen/WebAssembly/simd-extadd.ll diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 14097d7b40a9c..0bdddcffd723d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1541,6 +1541,32 @@ def : Pat<(v4i32 (int_wasm_extadd_pairwise_signed (v8i16 V128:$in))), def : Pat<(v8i16 (int_wasm_extadd_pairwise_signed (v16i8 V128:$in))), (extadd_pairwise_s_I16x8 V128:$in)>; +multiclass ExtAddPairwiseShuffle { + foreach sign = ["s", "u"] in { + def : Pat<(to_ty (add + (!cast("extend_low_"#sign) (from_ty (wasm_shuffle (from_ty V128:$vec), (from_ty srcvalue), + (i32 a0), (i32 a1), (i32 a2), (i32 a3), + (i32 a4), (i32 a5), (i32 a6), (i32 a7), + (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), + (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), (i32 srcvalue)))), + (!cast("extend_low_"#sign) (from_ty (wasm_shuffle (from_ty V128:$vec), (from_ty srcvalue), + (i32 b0), (i32 b1), (i32 b2), (i32 b3), + (i32 b4), (i32 b5), (i32 b6), (i32 b7), + (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), + (i32 srcvalue), (i32 srcvalue), (i32 srcvalue), (i32 srcvalue)))))), + (!cast("extadd_pairwise_"#sign#"_"#suffix) V128:$vec)>; + } +} + +defm : ExtAddPairwiseShuffle; +defm : ExtAddPairwiseShuffle; + // f64x2 <-> f32x4 conversions def demote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; def demote_zero : SDNode<"WebAssemblyISD::DEMOTE_ZERO", demote_t>; diff --git a/llvm/test/CodeGen/WebAssembly/simd-extadd.ll b/llvm/test/CodeGen/WebAssembly/simd-extadd.ll new file mode 100644 index 0000000000000..dfc47a6abf03a --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-extadd.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -O2 -mtriple=wasm32 -mattr=+simd128 | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +; Test that extending the even and odd shuffles of the same vector and adding them converts to extadd_pairwise + +define <8 x i16> @test_extadd_pairwise_i8x16_s(<16 x i8> %v) { +; CHECK-LABEL: test_extadd_pairwise_i8x16_s: +; CHECK: .functype test_extadd_pairwise_i8x16_s (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.extadd_pairwise_i8x16_s +; CHECK-NEXT: # fallthrough-return + %even = shufflevector <16 x i8> %v, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %odd = shufflevector <16 x i8> %v, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %even_ext = sext <8 x i8> %even to <8 x i16> + %odd_ext = sext <8 x i8> %odd to <8 x i16> + %result = add <8 x i16> %even_ext, %odd_ext + ret <8 x i16> %result +} + +define <8 x i16> @test_extadd_pairwise_i8x16_u(<16 x i8> %v) { +; CHECK-LABEL: test_extadd_pairwise_i8x16_u: +; CHECK: .functype test_extadd_pairwise_i8x16_u (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.extadd_pairwise_i8x16_u +; CHECK-NEXT: # fallthrough-return + %even = shufflevector <16 x i8> %v, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %odd = shufflevector <16 x i8> %v, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %even_ext = zext <8 x i8> %even to <8 x i16> + %odd_ext = zext <8 x i8> %odd to <8 x i16> + %result = add <8 x i16> %even_ext, %odd_ext + ret <8 x i16> %result +} + +define <4 x i32> @test_extadd_pairwise_i16x8_s(<8 x
i16> %v) { +; CHECK-LABEL: test_extadd_pairwise_i16x8_s: +; CHECK: .functype test_extadd_pairwise_i16x8_s (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.extadd_pairwise_i16x8_s +; CHECK-NEXT: # fallthrough-return + %even = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %odd = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %even_ext = sext <4 x i16> %even to <4 x i32> + %odd_ext = sext <4 x i16> %odd to <4 x i32> + %result = add <4 x i32> %even_ext, %odd_ext + ret <4 x i32> %result +} + +define <4 x i32> @test_extadd_pairwise_i16x8_u(<8 x i16> %v) { +; CHECK-LABEL: test_extadd_pairwise_i16x8_u: +; CHECK: .functype test_extadd_pairwise_i16x8_u (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.extadd_pairwise_i16x8_u +; CHECK-NEXT: # fallthrough-return + %even = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %odd = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %even_ext = zext <4 x i16> %even to <4 x i32> + %odd_ext = zext <4 x i16> %odd to <4 x i32> + %result = add <4 x i32> %even_ext, %odd_ext + ret <4 x i32> %result +} + +; Negative test: the shuffle masks don't fit the even/odd pairwise pattern +define <4 x i32> @negative_test_extadd_pairwise_i16x8_u(<8 x i16> %v) { +; CHECK-LABEL: negative_test_extadd_pairwise_i16x8_u: +; CHECK: .functype negative_test_extadd_pairwise_i16x8_u (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 1, 6, 7, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: i32x4.extend_low_i16x8_u +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1 +; CHECK-NEXT: i32x4.extend_low_i16x8_u +; CHECK-NEXT: i32x4.add +; CHECK-NEXT: # fallthrough-return + %even = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 6> + %odd = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %even_ext = zext <4 x i16> %even to <4 x i32> + %odd_ext = zext <4 x i16> %odd to <4 x i32> + %result = add <4 x i32> %even_ext, %odd_ext + ret <4 x i32> %result +} From 3ce893f83450fd487710c5a319bd62b851a32291 Mon Sep 17 00:00:00 2001 From: Stefan Gränitz Date: Tue, 18 Nov 2025 11:44:00 +0100 Subject: [PATCH 23/35] [ORC] Move DebugObjectManagerPlugin into Debugging/ELFDebugObjectPlugin (NFC) (#168343) In four years the plugin was never adapted to other object formats. This patch makes it ELF-specific, which will allow us to remove some abstractions down the line. It also moves the plugin from LLVMOrcJIT into LLVMOrcDebugging, which didn't exist back then.
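For out-of-tree users the update is mechanical: include the header from its new Debugging/ location and rename the class at its construction site. A minimal sketch of the updated usage, mirroring the llvm-jitlink change below (the helper function is illustrative; the constructor arguments are RequireDebugSections and AutoRegisterCode):

```cpp
#include "llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h"
#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
#include <memory>

using namespace llvm;
using namespace llvm::orc;

// Illustrative helper: attach the renamed ELF debug-object plugin to an
// ObjectLinkingLayer so in-memory debug objects get registered with GDB.
static Error addELFDebugSupport(ExecutionSession &ES,
                                ObjectLinkingLayer &ObjLayer) {
  Error Err = Error::success();
  auto Plugin = std::make_unique<ELFDebugObjectPlugin>(
      ES, /*RequireDebugSections=*/true, /*AutoRegisterCode=*/true, Err);
  if (Err)
    return Err;
  ObjLayer.addPlugin(std::move(Plugin));
  return Error::success();
}
```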
--- clang/lib/Interpreter/IncrementalExecutor.cpp | 1 - .../RemoteJITUtils.cpp | 3 +- .../Orc/Debugging/DebuggerSupportPlugin.h | 2 +- .../ELFDebugObjectPlugin.h} | 16 ++++----- llvm/lib/ExecutionEngine/Orc/CMakeLists.txt | 1 - .../Orc/Debugging/CMakeLists.txt | 1 + .../Orc/Debugging/DebuggerSupport.cpp | 6 ++-- .../ELFDebugObjectPlugin.cpp} | 34 +++++++++---------- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 6 ++-- .../llvm/lib/ExecutionEngine/Orc/BUILD.gn | 2 +- 10 files changed, 34 insertions(+), 38 deletions(-) rename llvm/include/llvm/ExecutionEngine/Orc/{DebugObjectManagerPlugin.h => Debugging/ELFDebugObjectPlugin.h} (87%) rename llvm/lib/ExecutionEngine/Orc/{DebugObjectManagerPlugin.cpp => Debugging/ELFDebugObjectPlugin.cpp} (94%) diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp index 45620fcd358c8..74a489f4b3ac9 100644 --- a/clang/lib/Interpreter/IncrementalExecutor.cpp +++ b/clang/lib/Interpreter/IncrementalExecutor.cpp @@ -18,7 +18,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" -#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupport.h" #include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h" #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h" diff --git a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp index 83c5899852d64..6e2aaf32325a9 100644 --- a/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp +++ b/llvm/examples/OrcV2Examples/LLJITWithRemoteDebugging/RemoteJITUtils.cpp @@ -9,8 +9,7 @@ #include "RemoteJITUtils.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" -#include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h" +#include "llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h" #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h" #include "llvm/ExecutionEngine/Orc/Shared/SimpleRemoteEPCUtils.h" #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h" diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h index 3ca3afa122836..1581f7aca211e 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h @@ -24,7 +24,7 @@ namespace orc { /// For each object containing debug info, installs JITLink passes to synthesize /// a debug object and then register it via the GDB JIT-registration interface. /// -/// Currently MachO only. For ELF use DebugObjectManagerPlugin. These two +/// Currently MachO only. For ELF use ELFDebugObjectPlugin. These two /// plugins will be merged in the near future. 
class LLVM_ABI GDBJITDebugInfoRegistrationPlugin : public ObjectLinkingLayer::Plugin { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h similarity index 87% rename from llvm/include/llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h rename to llvm/include/llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h index 1988403715f57..d946a029fd2ec 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h @@ -1,4 +1,4 @@ -//===---- DebugObjectManagerPlugin.h - JITLink debug objects ---*- C++ -*-===// +//===------ ELFDebugObjectPlugin.h - JITLink debug objects ------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_EXECUTIONENGINE_ORC_DEBUGOBJECTMANAGERPLUGIN_H -#define LLVM_EXECUTIONENGINE_ORC_DEBUGOBJECTMANAGERPLUGIN_H +#ifndef LLVM_EXECUTIONENGINE_ORC_ELFDEBUGOBJECTPLUGIN_H +#define LLVM_EXECUTIONENGINE_ORC_ELFDEBUGOBJECTPLUGIN_H #include "llvm/ExecutionEngine/JITLink/JITLink.h" #include "llvm/ExecutionEngine/Orc/Core.h" @@ -46,7 +46,7 @@ class DebugObject; /// DebugObjectRegistrar is notified. Ownership of DebugObjects remains with the /// plugin. /// -class LLVM_ABI DebugObjectManagerPlugin : public ObjectLinkingLayer::Plugin { +class LLVM_ABI ELFDebugObjectPlugin : public ObjectLinkingLayer::Plugin { public: /// Create the plugin to submit DebugObjects for JITLink artifacts. For all /// options the recommended setting is true. @@ -63,9 +63,9 @@ class LLVM_ABI DebugObjectManagerPlugin : public ObjectLinkingLayer::Plugin { /// sequence. When turning this off, the user has to issue the call to /// __jit_debug_register_code() on the executor side manually. 
/// - DebugObjectManagerPlugin(ExecutionSession &ES, bool RequireDebugSections, - bool AutoRegisterCode, Error &Err); - ~DebugObjectManagerPlugin() override; + ELFDebugObjectPlugin(ExecutionSession &ES, bool RequireDebugSections, + bool AutoRegisterCode, Error &Err); + ~ELFDebugObjectPlugin() override; void notifyMaterializing(MaterializationResponsibility &MR, jitlink::LinkGraph &G, jitlink::JITLinkContext &Ctx, @@ -99,4 +99,4 @@ class LLVM_ABI DebugObjectManagerPlugin : public ObjectLinkingLayer::Plugin { } // namespace orc } // namespace llvm -#endif // LLVM_EXECUTIONENGINE_ORC_DEBUGOBJECTMANAGERPLUGIN_H +#endif // LLVM_EXECUTIONENGINE_ORC_ELFDEBUGOBJECTPLUGIN_H diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt index db16a3005f6c1..41402f7a69ccb 100644 --- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt @@ -14,7 +14,6 @@ add_llvm_component_library(LLVMOrcJIT CompileOnDemandLayer.cpp CompileUtils.cpp Core.cpp - DebugObjectManagerPlugin.cpp DebugUtils.cpp EHFrameRegistrationPlugin.cpp EPCDynamicLibrarySearchGenerator.cpp diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt index 186df5dad072e..ab287c7af60be 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_component_library(LLVMOrcDebugging DebugInfoSupport.cpp DebuggerSupport.cpp DebuggerSupportPlugin.cpp + ELFDebugObjectPlugin.cpp LLJITUtilsCBindings.cpp PerfSupportPlugin.cpp VTuneSupportPlugin.cpp diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupport.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupport.cpp index 06667869b4803..7be58871ff57b 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupport.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupport.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupport.h" -#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h" +#include "llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" #define DEBUG_TYPE "orc" @@ -36,8 +36,8 @@ Error enableDebuggerSupport(LLJIT &J) { switch (TT.getObjectFormat()) { case Triple::ELF: { Error TargetSymErr = Error::success(); - ObjLinkingLayer->addPlugin(std::make_unique( - ES, false, true, TargetSymErr)); + ObjLinkingLayer->addPlugin( + std::make_unique(ES, false, true, TargetSymErr)); return TargetSymErr; } case Triple::MachO: { diff --git a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp similarity index 94% rename from llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp rename to llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp index d183134f3b769..9f556b0d07a8b 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.cpp @@ -1,4 +1,4 @@ -//===------- DebugObjectManagerPlugin.cpp - JITLink debug objects ---------===// +//===------- ELFDebugObjectPlugin.cpp - JITLink debug objects ---------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" +#include "llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringMap.h" @@ -406,10 +406,9 @@ createDebugObjectFromBuffer(ExecutionSession &ES, LinkGraph &G, } } -DebugObjectManagerPlugin::DebugObjectManagerPlugin(ExecutionSession &ES, - bool RequireDebugSections, - bool AutoRegisterCode, - Error &Err) +ELFDebugObjectPlugin::ELFDebugObjectPlugin(ExecutionSession &ES, + bool RequireDebugSections, + bool AutoRegisterCode, Error &Err) : ES(ES), RequireDebugSections(RequireDebugSections), AutoRegisterCode(AutoRegisterCode) { // Pass bootstrap symbol for registration function to enable debugging @@ -418,9 +417,9 @@ DebugObjectManagerPlugin::DebugObjectManagerPlugin(ExecutionSession &ES, {{RegistrationAction, rt::RegisterJITLoaderGDBAllocActionName}}); } -DebugObjectManagerPlugin::~DebugObjectManagerPlugin() = default; +ELFDebugObjectPlugin::~ELFDebugObjectPlugin() = default; -void DebugObjectManagerPlugin::notifyMaterializing( +void ELFDebugObjectPlugin::notifyMaterializing( MaterializationResponsibility &MR, LinkGraph &G, JITLinkContext &Ctx, MemoryBufferRef ObjBuffer) { std::lock_guard Lock(PendingObjsLock); @@ -443,9 +442,9 @@ void DebugObjectManagerPlugin::notifyMaterializing( } } -void DebugObjectManagerPlugin::modifyPassConfig( - MaterializationResponsibility &MR, LinkGraph &G, - PassConfiguration &PassConfig) { +void ELFDebugObjectPlugin::modifyPassConfig(MaterializationResponsibility &MR, + LinkGraph &G, + PassConfiguration &PassConfig) { // Not all link artifacts have associated debug objects. std::lock_guard Lock(PendingObjsLock); auto It = PendingObjs.find(&MR); @@ -507,16 +506,15 @@ void DebugObjectManagerPlugin::modifyPassConfig( } } -Error DebugObjectManagerPlugin::notifyFailed( - MaterializationResponsibility &MR) { +Error ELFDebugObjectPlugin::notifyFailed(MaterializationResponsibility &MR) { std::lock_guard Lock(PendingObjsLock); PendingObjs.erase(&MR); return Error::success(); } -void DebugObjectManagerPlugin::notifyTransferringResources(JITDylib &JD, - ResourceKey DstKey, - ResourceKey SrcKey) { +void ELFDebugObjectPlugin::notifyTransferringResources(JITDylib &JD, + ResourceKey DstKey, + ResourceKey SrcKey) { // Debug objects are stored by ResourceKey only after registration. // Thus, pending objects don't need to be updated here. std::lock_guard Lock(RegisteredObjsLock); @@ -530,8 +528,8 @@ void DebugObjectManagerPlugin::notifyTransferringResources(JITDylib &JD, } } -Error DebugObjectManagerPlugin::notifyRemovingResources(JITDylib &JD, - ResourceKey Key) { +Error ELFDebugObjectPlugin::notifyRemovingResources(JITDylib &JD, + ResourceKey Key) { // Removing the resource for a pending object fails materialization, so they // get cleaned up in the notifyFailed() handler. 
std::lock_guard Lock(RegisteredObjsLock); diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 217e521b2e43e..cf5200a73e5cc 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -17,9 +17,9 @@ #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX, LLVM_ENABLE_THREADS #include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/COFFPlatform.h" -#include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" #include "llvm/ExecutionEngine/Orc/Debugging/DebugInfoSupport.h" #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h" +#include "llvm/ExecutionEngine/Orc/Debugging/ELFDebugObjectPlugin.h" #include "llvm/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.h" #include "llvm/ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.h" #include "llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h" @@ -1298,8 +1298,8 @@ Session::Session(std::unique_ptr EPC, Error &Err) ObjLayer.addPlugin(ExitOnErr(EHFrameRegistrationPlugin::Create(ES))); if (DebuggerSupport) { Error TargetSymErr = Error::success(); - auto Plugin = std::make_unique<DebugObjectManagerPlugin>(ES, true, true, - TargetSymErr); + auto Plugin = + std::make_unique<ELFDebugObjectPlugin>(ES, true, true, TargetSymErr); if (!TargetSymErr) ObjLayer.addPlugin(std::move(Plugin)); else diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn index 0034cd9993b88..24542daed18b5 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn @@ -22,7 +22,7 @@ static_library("Orc") { "CompileOnDemandLayer.cpp", "CompileUtils.cpp", "Core.cpp", - "DebugObjectManagerPlugin.cpp", + "ELFDebugObjectPlugin.cpp", "DebugUtils.cpp", "EHFrameRegistrationPlugin.cpp", "ELFNixPlatform.cpp", From 49d77d87d418e6e8e1a41e5ddefe74b1848da2af Mon Sep 17 00:00:00 2001 From: Evgenii Kudriashov Date: Tue, 18 Nov 2025 11:53:08 +0100 Subject: [PATCH 24/35] [X86][GlobalISel] Enable nest arguments (#165173) Nest arguments are already supported by the calling convention in X86CallingConv.td; nothing special is required in GlobalISel, as we reuse that code. The nest attribute is mostly generated by the Fortran frontend.
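For context, `nest` marks the static-chain parameter (used for trampolines and closure environments), and the X86 calling convention pins it to a dedicated register, R10 on x86-64 and ECX on i686, as the new test below checks. A hedged sketch of how a frontend tags such a parameter through the LLVM C++ API (the helper name is illustrative):

```cpp
#include "llvm/IR/Function.h"

// Illustrative: mark parameter 0 of F as the 'nest' (static chain)
// argument; the X86 calling convention then passes it in R10 (x86-64)
// or ECX (i686) instead of the normal argument registers.
static void markStaticChain(llvm::Function &F) {
  F.addParamAttr(0, llvm::Attribute::Nest);
}
```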
--- llvm/lib/Target/X86/GISel/X86CallLowering.cpp | 3 +-- llvm/test/CodeGen/X86/isel-arg-attrs.ll | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/X86/isel-arg-attrs.ll diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp index c0b9339e9bc34..b07ce2b958fa0 100644 --- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp @@ -280,8 +280,7 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, if (Arg.hasAttribute(Attribute::ByVal) || Arg.hasAttribute(Attribute::InReg) || Arg.hasAttribute(Attribute::SwiftSelf) || - Arg.hasAttribute(Attribute::SwiftError) || - Arg.hasAttribute(Attribute::Nest) || VRegs[Idx].size() > 1) + Arg.hasAttribute(Attribute::SwiftError) || VRegs[Idx].size() > 1) return false; if (Arg.hasAttribute(Attribute::StructRet)) { diff --git a/llvm/test/CodeGen/X86/isel-arg-attrs.ll b/llvm/test/CodeGen/X86/isel-arg-attrs.ll new file mode 100644 index 0000000000000..3afee76715d6d --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-arg-attrs.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=X64 + +; The src array should be in R10 or ECX register due to nest attribute +define i32 @nest_arg(ptr nest %src) { +; X86-LABEL: nest_arg: +; X86: # %bb.0: +; X86-NEXT: movl 8(%ecx), %eax +; X86-NEXT: retl +; +; X64-LABEL: nest_arg: +; X64: # %bb.0: +; X64-NEXT: movl 8(%r10), %eax +; X64-NEXT: retq + %off = getelementptr [3 x i32], ptr %src, i32 0, i32 2 + %ret = load i32, ptr %off + ret i32 %ret +} From 88465af8d922a19d7098860339859ae484aba524 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 18 Nov 2025 10:54:43 +0000 Subject: [PATCH 25/35] [gn build] Port --- llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn index 24542daed18b5..ab3b717eed69d 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn @@ -22,7 +22,6 @@ static_library("Orc") { "CompileOnDemandLayer.cpp", "CompileUtils.cpp", "Core.cpp", - "ELFDebugObjectPlugin.cpp", "DebugUtils.cpp", "EHFrameRegistrationPlugin.cpp", "ELFNixPlatform.cpp", From 3378ea27242e0ace407c09dab3f32c17e26f4768 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 18 Nov 2025 10:54:44 +0000 Subject: [PATCH 26/35] [gn build] Port 3ce893f83450 --- .../gn/secondary/llvm/lib/ExecutionEngine/Orc/Debugging/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Debugging/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Debugging/BUILD.gn index 5610679ff333e..a054e45b22ce5 100644 --- 
a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Debugging/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Debugging/BUILD.gn @@ -11,6 +11,7 @@ static_library("Debugging") { "DebugInfoSupport.cpp", "DebuggerSupport.cpp", "DebuggerSupportPlugin.cpp", + "ELFDebugObjectPlugin.cpp", "LLJITUtilsCBindings.cpp", "PerfSupportPlugin.cpp", "VTuneSupportPlugin.cpp", From 8592a65a436ff1955bf82fb4a57e1ba13708374a Mon Sep 17 00:00:00 2001 From: Jonathan Thackray Date: Tue, 18 Nov 2025 11:08:57 +0000 Subject: [PATCH 27/35] [AArch64][llvm] GICv5 instruction `GIC CDEOI` takes no operand (#167322) There was a minor oversight in commit 6836261ee; the AArch64 GICv5 instruction `GIC CDEOI` takes no operands, since the text of the specification says: ``` The Rt field should be set to 0b11111. If the Rt field is not set to 0b11111, it is CONSTRAINED UNPREDICTABLE whether: * The instruction is UNDEFINED. * The instruction behaves as if the Rt field is set to 0b11111. ``` --- llvm/lib/Target/AArch64/AArch64SystemOperands.td | 8 ++++---- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 2 +- .../Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp | 2 +- llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s | 4 ++++ llvm/test/MC/AArch64/armv9.7a-gcie.s | 8 ++++---- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 1dd132e9a7301..cb098751fd74d 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -2602,14 +2602,14 @@ foreach n=0-15 in { //===----------------------------------------------------------------------===// // GIC -class GIC op1, bits<4> crn, bits<4> crm, bits<3> op2> { +class GIC op1, bits<4> crn, bits<4> crm, bits<3> op2, bit needsreg = 1> { string Name = name; bits<14> Encoding; let Encoding{13-11} = op1; let Encoding{10-7} = crn; let Encoding{6-3} = crm; let Encoding{2-0} = op2; - bit NeedsReg = 1; + bit NeedsReg = needsreg; string RequiresStr = [{ {AArch64::FeatureGCIE} }]; } @@ -2686,12 +2686,12 @@ def : GSB<"ack", 0b000, 0b1100, 0b0000, 0b001>; def : GICR<"cdia", 0b000, 0b1100, 0b0011, 0b000>; def : GICR<"cdnmia", 0b000, 0b1100, 0b0011, 0b001>; -// Op1 CRn CRm Op2 +// Op1 CRn CRm Op2, needsreg def : GIC<"cdaff", 0b000, 0b1100, 0b0001, 0b011>; def : GIC<"cddi", 0b000, 0b1100, 0b0010, 0b000>; def : GIC<"cddis", 0b000, 0b1100, 0b0001, 0b000>; def : GIC<"cden", 0b000, 0b1100, 0b0001, 0b001>; -def : GIC<"cdeoi", 0b000, 0b1100, 0b0001, 0b111>; +def : GIC<"cdeoi", 0b000, 0b1100, 0b0001, 0b111, 0>; def : GIC<"cdhm", 0b000, 0b1100, 0b0010, 0b001>; def : GIC<"cdpend", 0b000, 0b1100, 0b0001, 0b100>; def : GIC<"cdpri", 0b000, 0b1100, 0b0001, 0b010>; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 5cc39319d71c0..433cb0387c470 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -4111,7 +4111,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, setRequiredFeatureString(GIC->getRequiredFeatures(), Str); return TokError(Str); } - ExpectRegister = true; + ExpectRegister = GIC->NeedsReg; createSysAlias(GIC->Encoding, Operands, S); } else if (Mnemonic == "gsb") { const AArch64GSB::GSB *GSB = AArch64GSB::lookupGSBByName(Op); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp 
b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index bbc34ad35296c..3e4c1101fb8e1 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1034,7 +1034,7 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, if (!GIC || !GIC->haveFeatures(STI.getFeatureBits())) return false; - NeedsReg = true; + NeedsReg = GIC->NeedsReg; Ins = "gic\t"; Name = std::string(GIC->Name); } else { diff --git a/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s b/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s index cffee7dbbe31e..84860857c3b8f 100644 --- a/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s +++ b/llvm/test/MC/AArch64/armv9.7a-gcie-diagnostics.s @@ -16,3 +16,7 @@ gicr x3, foo gic cdaff // CHECK-ERROR: error: specified gic op requires a register + +gic cdeoi, x3 +// CHECK-ERROR: error: specified gic op does not use a register + diff --git a/llvm/test/MC/AArch64/armv9.7a-gcie.s b/llvm/test/MC/AArch64/armv9.7a-gcie.s index 4fd5d2577e26a..74e95015f6c86 100644 --- a/llvm/test/MC/AArch64/armv9.7a-gcie.s +++ b/llvm/test/MC/AArch64/armv9.7a-gcie.s @@ -828,10 +828,10 @@ GIC CDEN, x3 // CHECK-UNKNOWN: d508c123 sys #0, c12, c1, #1, x3 // CHECK-ERROR: error: GIC cden requires: gcie -GIC CDEOI, x3 -// CHECK-INST: gic cdeoi, x3 -// CHECK-ENCODING: [0xe3,0xc1,0x08,0xd5] -// CHECK-UNKNOWN: d508c1e3 sys #0, c12, c1, #7, x3 +GIC CDEOI +// CHECK-INST: gic cdeoi +// CHECK-ENCODING: [0xff,0xc1,0x08,0xd5] +// CHECK-UNKNOWN: d508c1ff sys #0, c12, c1, #7 // CHECK-ERROR: error: GIC cdeoi requires: gcie GIC CDHM, x3 From fb829bf11feeb53f815a3abf539e63ec3a23ed3d Mon Sep 17 00:00:00 2001 From: Pradeep Kumar Date: Tue, 18 Nov 2025 16:40:31 +0530 Subject: [PATCH 28/35] [MLIR][NVVM] Add tcgen05.mma MLIR Ops (#164356) This commit adds support for the tcgen05.mma family of instructions in the NVVM MLIR dialect and lowers them to LLVM intrinsics.
Please refer [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions) for information --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 545 +++++++++++++++ mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 612 +++++++++++++++++ .../nvvm/tcgen05-mma-block-scale-shared.mlir | 229 +++++++ .../nvvm/tcgen05-mma-block-scale-tensor.mlir | 229 +++++++ .../LLVMIR/nvvm/tcgen05-mma-invalid.mlir | 119 ++++ .../LLVMIR/nvvm/tcgen05-mma-shared.mlir | 442 ++++++++++++ .../tcgen05-mma-sp-block-scale-shared.mlir | 229 +++++++ .../tcgen05-mma-sp-block-scale-tensor.mlir | 229 +++++++ .../LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir | 442 ++++++++++++ .../LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir | 634 ++++++++++++++++++ .../LLVMIR/nvvm/tcgen05-mma-tensor.mlir | 633 +++++++++++++++++ .../LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir | 133 ++++ .../LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir | 133 ++++ .../LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir | 133 ++++ .../LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir | 133 ++++ 15 files changed, 4875 insertions(+) create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 6e3a92b5bde42..87c73c4587485 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -4676,6 +4676,551 @@ def NVVM_ClusterLaunchControlQueryCancelOp }]; } +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma Ops +//===----------------------------------------------------------------------===// + +def Tcgen05MMAKindF16 : I32EnumAttrCase<"F16", 0, "f16">; +def Tcgen05MMAKindTF32 : I32EnumAttrCase<"TF32", 1, "tf32">; +def Tcgen05MMAKindF8F6F4 : I32EnumAttrCase<"F8F6F4", 2, "f8f6f4">; +def Tcgen05MMAKindINT8 : I32EnumAttrCase<"I8", 3, "i8">; + +def Tcgen05MMAKind : I32EnumAttr< + "Tcgen05MMAKind", + "tcgen05 MMA Supported Types", + [Tcgen05MMAKindF8F6F4, Tcgen05MMAKindINT8, Tcgen05MMAKindF16, + Tcgen05MMAKindTF32]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMAKindAttr : EnumAttr { + let description = [{ + The Tcgen05MMAKind attribute describes the allowed set of types for matrix A and B in the tcgen05.mma.{sp} Op. 
The following are supported types for each kind: + + ``` + +-------------+--------------------------------------------+ + | Matrix Kind | supported types for A / B | + +-------------+--------------------------------------------+ + | f16 | f16, bf16 | + | tf32 | tf32 | + | f8f6f4 | e4m3, e5m2, e2m3, e3m2, e2m1 | + | i8 | unsigned 8b, signed 8b | + +-------------+--------------------------------------------+ + ``` + }]; + let assemblyFormat = "`<` $value `>`"; +} + +def Tcgen05MMACollectorOpDiscard : I32EnumAttrCase<"DISCARD", 0, "discard">; +def Tcgen05MMACollectorOpLastUse : I32EnumAttrCase<"LASTUSE", 1, "lastuse">; +def Tcgen05MMACollectorOpFill : I32EnumAttrCase<"FILL", 2, "fill">; +def Tcgen05MMACollectorOpUse : I32EnumAttrCase<"USE", 3, "use">; + +def Tcgen05MMACollectorOp : I32EnumAttr< + "Tcgen05MMACollectorOp", + "tcgen05.mma Collector Buffer Operation", + [Tcgen05MMACollectorOpDiscard, + Tcgen05MMACollectorOpLastUse, + Tcgen05MMACollectorOpFill, + Tcgen05MMACollectorOpUse]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMACollectorOpAttr : EnumAttr { + let description = [{ + Tcgen05MMACollectorOp attribute specifies the collector buffer operations. + The following are the supported operations: + * discard : Release buffer after use (default) + * lastuse : Mark buffer for last use + * fill : Fill buffer + * use : Use buffer without modification + }]; + let assemblyFormat = "`<` $value `>`"; +} + +def NVVM_Tcgen05MMAOp : NVVM_Op<"tcgen05.mma", + [AttrSizedOperandSegments, + NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs MMA operation on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma` operation is an asynchronous tensor core instruction that + performs matrix multiplication, accumulation in a single fused operation. It + targets 5th-generation tensor cores, providing developers with fine-grained + control over execution and scheduling. + + ``` + D = A * B + (D * 2^ -scaleInputD) // if `scaleInputD` is provided + D = A * B // if `enableInputD` is false + D = A * B + D // otherwise + ``` + + where: + - A is an `M x K` matrix in tensor memory or described using shared memory descriptor + - B is a `K x N` matrix described using shared memory descriptor + - D is an `M x N` accumulator matrix in tensor memory + + The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op + + - idesc is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor) + + Optional Operands: + - `scaleInputD` is an Immediate value operand used for scaling D matrix by 2 ^ (-scaleInputD). 
The valid range is [0, 15] + + - `disableOutputLane` is a vector mask for selective output + * vector<4 x i32> when ctaGroup is CTA_1 + * vector<8 x i32> when ctaGroup is CTA_2 + + Required Attributes: + - `kind` is a Tcgen05MMAKind attribute + + - `ctaGroup` specifies CTA group configuration + * cta_1: MMA will be performed on the current thread's CTA + * cta_2: MMA will be performed on the current thread and it's peer CTA + + Default Attributes: + - collectorOp is a Tcgen05MMACollectorOp attribute with matrix A as the collector buffer + + - `aShift` shifts the rows of the A matrix down by one row and can only be + applied if A is in tensor memory + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma) + }]; + + let arguments = (ins + Tcgen05MMAKindAttr:$kind, + CTAGroupKindAttr:$ctaGroup, + DefaultValuedAttr:$collectorOp, + UnitAttr:$aShift, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + Optional:$scaleInputD, + Optional>:$disableOutputLane + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD (`scale` `=` $scaleInputD^)? + (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)` + }]; + + let hasVerifier = true; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = NVVM::Tcgen05MMAOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + +def NVVM_Tcgen05MMASparseOp : NVVM_Op<"tcgen05.mma.sp", + [AttrSizedOperandSegments, + NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs MMA operation with sparse A matrix on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma.sp` operation is an asynchronous tensor core instruction + that performs matrix multiplication, accumulation with sparse `A` matrix in + a single fused operation. It targets 5th-generation tensor cores, providing + developers with fine-grained control over execution and scheduling. + + ``` + D = A * B + (D * 2^ -scaleInputD) // if `scaleInputD` is provided + D = A * B // if `enableInputD` is false + D = A * B + D // otherwise + ``` + + where: + - A is an `M x (K / 2)` matrix in tensor memory or described using shared memory descriptor + - B is a `K x N` matrix described using shared memory descriptor + - D is an `M x N` accumulator matrix in tensor memory + - sparseMetadata located in tensor memory specifies the mapping of the `K / 2` + non-zero elements to the K elements before performing the MMA operation + + Other attributes and operands are similar to that of tcgen05.mma Op + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp) + }]; + + let arguments = (ins + Tcgen05MMAKindAttr:$kind, + CTAGroupKindAttr:$ctaGroup, + DefaultValuedAttr:$collectorOp, + UnitAttr:$aShift, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + LLVM_PointerTensor:$sparseMetadata, + Optional:$scaleInputD, + Optional>:$disableOutputLane + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata (`scale` `=` $scaleInputD^)? (`mask` `=` $disableOutputLane^)? 
attr-dict `:` `(` type(operands) `)` + }]; + + let hasVerifier = true; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = NVVM::Tcgen05MMASparseOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + +def Tcgen05MMAKindMXF8F6F4 : I32EnumAttrCase<"MXF8F6F4", 0, "mxf8f6f4">; +def Tcgen05MMAKindMXF4 : I32EnumAttrCase<"MXF4", 1, "mxf4">; +def Tcgen05MMAKindMXF4NVF4 : I32EnumAttrCase<"MXF4NVF4", 2, "mxf4nvf4">; + +def Tcgen05MMABlockScaleKind : I32EnumAttr< + "Tcgen05MMABlockScaleKind", + "tcgen05.mma.block_scale supported types", + [Tcgen05MMAKindMXF8F6F4, Tcgen05MMAKindMXF4, Tcgen05MMAKindMXF4NVF4]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMABlockScaleKindAttr : EnumAttr { + let description = [{ + The Tcgen05MMABlockScaleKind attribute describes the allowed set of types for matrix A and B in the tcgen05.mma.{sp}.block_scale Op. The following are supported types for each kind: + + ``` + +--------------+-------------------------------------------+ + | Matrix Kind | supported types for A / B | + +--------------+-------------------------------------------+ + | mxf8f6f4 | e4m3, e5m3, e2m3, e3m2, e2m1 | + | mxf4 | e2m1 | + | mxf4nvf4 | e2m1 | + +--------------+-------------------------------------------+ + ``` + }]; + let assemblyFormat = "`<` $value `>`"; +} + +def Tcgen05MMABlockScaleDefault : I32EnumAttrCase<"DEFAULT", 0, "default">; +def Tcgen05MMABlockScaleBlock16 : I32EnumAttrCase<"BLOCK16", 1, "block16">; +def Tcgen05MMABlockScaleBlock32 : I32EnumAttrCase<"BLOCK32", 2, "block32">; + +def Tcgen05MMABlockScale + : I32EnumAttr<"Tcgen05MMABlockScale", + "tcgen05.mma block scale attribute", + [Tcgen05MMABlockScaleDefault, Tcgen05MMABlockScaleBlock16, + Tcgen05MMABlockScaleBlock32]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMABlockScaleAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def NVVM_Tcgen05MMABlockScaleOp : NVVM_Op<"tcgen05.mma.block_scale", + [NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs block scaled MMA operation on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma.block_scale` operation is an asynchronous tensor core instruction + that performs matrix multiplication, accumulation with block scaling in a + single fused operation. It targets 5th-generation tensor cores, providing + developers with fine-grained control over execution and scheduling. 
+ + ``` + D = (A * scale_a) * (B * scale_b)` // if `enableInputD` is false + D = (A * scale_a) * (B * scale_b) + D` + ``` + + where: + - A is an M x (K / 2) matrix in tensor memory or described using shared memory descriptor + - B is a K x N matrix described using shared memory descriptor + - D is an M x N accumulator matrix in tensor memory + - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively + + The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op + + - `idesc` is a 32 bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor) + + Required Attributes: + - `kind` is a Tcgen05MMABlockScaleKind attribute + + - `ctaGroup` specifies CTA group configuration + * cta_1: MMA will be performed on the current thread's CTA + * cta_2: MMA will be performed on the current thread and it's peer CTA + + Default Attributes: + - collectorOp is a Tcgen05MMACollectorOp attribute with matrix A as the collector buffer + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma) + }]; + + let arguments = (ins + Tcgen05MMABlockScaleKindAttr:$kind, + CTAGroupKindAttr:$ctaGroup, + DefaultValuedAttr:$blockScale, + DefaultValuedAttr:$collectorOp, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, I1:$enableInputD, + LLVM_PointerTensor:$scaleA, + LLVM_PointerTensor:$scaleB + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $scaleA `,` $scaleB + attr-dict `:` `(` type(operands) `)` + }]; + + let hasVerifier = true; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = NVVM::Tcgen05MMABlockScaleOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + +def NVVM_Tcgen05MMASparseBlockScaleOp : NVVM_Op<"tcgen05.mma.sp.block_scale", + [NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs block scaled MMA operation with sparse A matrix on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma.sp.block_scale` operation is an asynchronous tensor core + instruction that performs matrix multiplication, accumulation with block + scaling, and sparse `A` matrix in a single fused operation. It targets + 5th-generation tensor cores, providing developers with fine-grained control + over execution, and scheduling. 
+ + ``` + D = (A * scale_a) * (B * scale_b) // if `enableInputD` is specified + D = (A * scale_a) * (B * scale_b) + D // otherwise + ``` + + where: + - A is an M x (K / 2) matrix in tensor memory or described using shared memory descriptor + - B is a K x N matrix described using shared memory descriptor + - D is an M x N accumulator matrix in tensor memory + - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively + + Other attributes and operands are similar to that of tcgen05.mma.block_scale Op + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp) + }]; + + let arguments = (ins + Tcgen05MMABlockScaleKindAttr:$kind, + CTAGroupKindAttr:$ctaGroup, + DefaultValuedAttr:$blockScale, + DefaultValuedAttr:$collectorOp, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + LLVM_PointerTensor:$sparseMetadata, + LLVM_PointerTensor:$scaleA, + LLVM_PointerTensor:$scaleB + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata `,` $scaleA `,` $scaleB + attr-dict `:` `(` type(operands) `)` + }]; + + let hasVerifier = true; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = NVVM::Tcgen05MMASparseBlockScaleOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + +def Tcgen05MMACollectorBBuffer0 : I32EnumAttrCase<"B0", 0, "b0">; +def Tcgen05MMACollectorBBuffer1 : I32EnumAttrCase<"B1", 1, "b1">; +def Tcgen05MMACollectorBBuffer2 : I32EnumAttrCase<"B2", 2, "b2">; +def Tcgen05MMACollectorBBuffer3 : I32EnumAttrCase<"B3", 3, "b3">; + +def Tcgen05MMACollectorBBuffer : I32EnumAttr< + "Tcgen05MMACollectorBBuffer", + "tcgen05 MMA Collector Buffer B Attribute", + [Tcgen05MMACollectorBBuffer0, Tcgen05MMACollectorBBuffer1, Tcgen05MMACollectorBBuffer2, + Tcgen05MMACollectorBBuffer3]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMACollectorBBufferAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def NVVM_Tcgen05MMAWsOp : NVVM_Op<"tcgen05.mma.ws", + [NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs weight stationary convolution MMA operation on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma.ws` operation is an asynchronous tensor core instruction + that performs weight stationary convolution matrix multiplication, accumulation + in a single fused operation. It targets 5th-generation tensor cores, providing + developers with fine-grained control over execution, and scheduling. 
+ + ``` + D = A * B` // if `enableInputD` is false + D = A * B + D` // otherwise + ``` + + where: + - A is an `M x K` matrix in tensor memory or described using shared memory descriptor + - B is a `K x N` matrix described using shared memory descriptor + - D is an `M x N` accumulator matrix in tensor memory + + The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op + + - idesc is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor) + + Optional Operands: + - zeroColMask is a 64 bit value representing the [Zero-column mask descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-zero-column-mask-descriptor) + + Required Attributes: + - `kind` is a Tcgen05MMAKind attribute + + Default Valued Attributes: + - collectorBBuffer specifies collector buffer for matrix B: b0 (default), b1, b2, b3 + + - collectorOp is a Tcgen05MMACollectorOp attribute with matrix B as the collector buffer + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-ws) + }]; + + let arguments = (ins + Tcgen05MMAKindAttr:$kind, + DefaultValuedAttr:$collectorBBuffer, + DefaultValuedAttr:$collectorOp, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + Optional:$zeroColMask + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD (`,` $zeroColMask^)? + attr-dict `:` `(` type(operands) `)` + }]; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = + NVVM::Tcgen05MMAWsOp::getIntrinsicIDAndArgs(*op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + +def NVVM_Tcgen05MMAWsSparseOp : NVVM_Op<"tcgen05.mma.ws.sp", + [NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs weight stationary convolution MMA with sparse A matrix on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma.ws.sp` operation is an asynchronous tensor core instruction + that performs weight stationary convolution matrix multiplication, accumulation + with sparse `A` matrix in a single fused operation. It targets 5th-generation + tensor cores, providing developers with fine-grained control over execution, + and scheduling. 
+ + ``` + D = A * B` // if `enableInputD` is false + D = A * B + D` // otherwise + ``` + + where: + - A is an M x (K / 2) matrix in memory or descriptor format + - B is a K x N matrix + - D is an M x N accumulator matrix + - sparseMetadata located in tensor memory specifies the mapping of the `K / 2` + non-zero elements to the K elements before performing the MMA operation + + Other attributes and operands are similar to that of tcgen05.mma.ws Op + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-ws-sp) + }]; + + let arguments = (ins + Tcgen05MMAKindAttr:$kind, + DefaultValuedAttr:$collectorBBuffer, + DefaultValuedAttr:$collectorOp, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + LLVM_PointerTensor:$sparseMetadata, + Optional:$zeroColMask + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata (`,` $zeroColMask^)? attr-dict `:` `(` type(operands) `)` + }]; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = NVVM::Tcgen05MMAWsSparseOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + //===----------------------------------------------------------------------===// // NVVM target attribute. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 7ac427dbe3941..369305b40c689 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/NVPTXAddrSpace.h" @@ -60,6 +61,18 @@ static bool isPtrInSharedCTASpace(mlir::Value ptr) { return isPtrInAddrSpace(ptr, NVVMMemorySpace::Shared); } +// Helper method to convert CtaGroupKind in NVVM Dialect to CtaGroupKind in LLVM +static constexpr llvm::nvvm::CTAGroupKind +getNVVMCtaGroupKind(NVVM::CTAGroupKind ctaGroup) { + switch (ctaGroup) { + case NVVM::CTAGroupKind::CTA_1: + return llvm::nvvm::CTAGroupKind::CG_1; + case NVVM::CTAGroupKind::CTA_2: + return llvm::nvvm::CTAGroupKind::CG_2; + } + llvm_unreachable("unsupported cta_group value"); +} + //===----------------------------------------------------------------------===// // Verifier methods //===----------------------------------------------------------------------===// @@ -3091,6 +3104,605 @@ NVVM::IDArgPair ClusterLaunchControlQueryCancelOp::getIntrinsicIDAndArgs( return {intrinsicID, args}; } +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair +Tcgen05MMAOp::getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + const bool isATensor = isa(A->getType()); + 
+  args.push_back(A);
+
+  args.push_back(mt.lookupValue(thisOp.getMatrixB()));
+  args.push_back(mt.lookupValue(thisOp.getIdesc()));
+  args.push_back(mt.lookupValue(thisOp.getEnableInputD()));
+
+  using EnableAShiftArray = std::array<llvm::Intrinsic::ID, 2>;
+  using CtaGroupArray = std::array<EnableAShiftArray, 2>;
+  using IsATensorArray = std::array<CtaGroupArray, 2>;
+  using HasScaleInputDArray = std::array<IsATensorArray, 2>;
+  using HasDisableOutputLaneArray = std::array<HasScaleInputDArray, 2>;
+
+  // [hasDisableOutputLane][hasScaleInputD][isATensor][CtaGroup][EnableAShift]
+  static constexpr HasDisableOutputLaneArray tcgen05MMAIDs = {{
+      // without disable output lane
+      {{
+          // without scale input D
+          {{
+              // shared
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_shared, notIntrinsic}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_shared, notIntrinsic}},
+              }},
+              // tensor
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_tensor,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_tensor_ashift}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_tensor,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_tensor_ashift}},
+              }},
+          }},
+          // with scale input D
+          {{
+              // shared
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_shared_scale_d,
+                    notIntrinsic}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_shared_scale_d,
+                    notIntrinsic}},
+              }},
+              // tensor
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_ashift}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_ashift}},
+              }},
+          }},
+      }},
+      // with disable output lane
+      {{
+          // without scale input D
+          {{
+              // shared
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1,
+                    notIntrinsic}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2,
+                    notIntrinsic}},
+              }},
+              // tensor
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift}},
+              }},
+          }},
+          // with scale input D
+          {{
+              // shared
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1,
+                    notIntrinsic}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2,
+                    notIntrinsic}},
+              }},
+              // tensor
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift}},
+              }},
+          }},
+      }},
+  }};
+
+  llvm::Value *ScaleInputD = mt.lookupValue(thisOp.getScaleInputD());
+  bool hasScaleInputD = ScaleInputD != nullptr;
+
+  llvm::Value *DisableOutputLane =
+      mt.lookupValue(thisOp.getDisableOutputLane());
+  bool hasDisableOutputLane = DisableOutputLane != nullptr;
+
+  const unsigned ctaGroup =
+      static_cast<unsigned>(getNVVMCtaGroupKind(thisOp.getCtaGroup()));
+
+  llvm::Intrinsic::ID ID =
+      tcgen05MMAIDs[hasDisableOutputLane][hasScaleInputD][isATensor]
+                   [ctaGroup - 1][thisOp.getAShift()];
+
+  assert(ID != notIntrinsic && "Invalid intrinsic for Tcgen05MMAOp.");
+
+  if (hasScaleInputD)
+    args.push_back(ScaleInputD);
+
+  if (hasDisableOutputLane)
+    args.push_back(DisableOutputLane);
+
+  args.push_back(builder.getInt32(static_cast<unsigned>(thisOp.getKind())));
+
+  if (!hasDisableOutputLane)
+    args.push_back(builder.getInt32(ctaGroup));
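+  // For the disable-output-lane variants, the CTA group is already encoded in
+  // the intrinsic name (_cg1/_cg2), so it is passed as an explicit argument
+  // only for the remaining variants.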
+
+  args.push_back(
+      builder.getInt32(static_cast<unsigned>(thisOp.getCollectorOp())));
+
+  return {ID, args};
+}
+
+static LogicalResult
+verifyTcgen05MMAOp(bool isATensor, mlir::Value disableOutputLane,
+                   NVVM::CTAGroupKind ctaGroup, bool hasAShift,
+                   NVVM::Tcgen05MMACollectorOp collectorOp, Location loc) {
+
+  if (disableOutputLane) {
+    mlir::VectorType disableOutputLaneType =
+        cast<mlir::VectorType>(disableOutputLane.getType());
+    if ((ctaGroup == NVVM::CTAGroupKind::CTA_1 &&
+         disableOutputLaneType.getNumElements() != 4) ||
+        (ctaGroup == NVVM::CTAGroupKind::CTA_2 &&
+         disableOutputLaneType.getNumElements() != 8))
+      return emitError(loc) << "Disable Output Lane of length "
+                            << disableOutputLaneType.getNumElements()
+                            << " is incompatible with CtaGroupAttr";
+  }
+
+  if (hasAShift && !isATensor)
+    return emitError(
+        loc, "A-shift can be applied only when matrix A is in tensor memory");
+
+  if (hasAShift && (collectorOp == Tcgen05MMACollectorOp::FILL ||
+                    collectorOp == Tcgen05MMACollectorOp::USE))
+    return emitError(
+        loc, "Cannot use collector buffer operation fill or use with ashift");
+
+  return success();
+}
+
+LogicalResult Tcgen05MMAOp::verify() {
+  return verifyTcgen05MMAOp(isa<LLVM::LLVMPointerType>(getMatrixA().getType()),
+                            getDisableOutputLane(), getCtaGroup(), getAShift(),
+                            getCollectorOp(), getLoc());
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma.sp functions
+//===----------------------------------------------------------------------===//
+
+mlir::NVVM::IDArgPair Tcgen05MMASparseOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+
+  auto thisOp = cast<Tcgen05MMASparseOp>(op);
+  llvm::SmallVector<llvm::Value *> args;
+
+  args.push_back(mt.lookupValue(thisOp.getMatrixD()));
+
+  llvm::Value *A = mt.lookupValue(thisOp.getMatrixA());
+  bool isATensor = isa<llvm::PointerType>(A->getType());
+  args.push_back(A);
+
+  args.push_back(mt.lookupValue(thisOp.getMatrixB()));
+  args.push_back(mt.lookupValue(thisOp.getIdesc()));
+  args.push_back(mt.lookupValue(thisOp.getEnableInputD()));
+  args.push_back(mt.lookupValue(thisOp.getSparseMetadata()));
+
+  using EnableAShiftArray = std::array<llvm::Intrinsic::ID, 2>;
+  using CtaGroupArray = std::array<EnableAShiftArray, 2>;
+  using IsATensorArray = std::array<CtaGroupArray, 2>;
+  using HasScaleInputDArray = std::array<IsATensorArray, 2>;
+  using HasDisableOutputLaneArray = std::array<HasScaleInputDArray, 2>;
+
+  // [hasDisableOutputLane][hasScaleInputD][isATensor][CtaGroup][EnableAShift]
+  static constexpr HasDisableOutputLaneArray tcgen05MMASparseIDs = {{
+      // without disable output lane
+      {{
+          // without scale input D
+          {{
+              // shared
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared, notIntrinsic}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared, notIntrinsic}},
+              }},
+              // tensor
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_ashift}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_ashift}},
+              }},
+          }},
+          // with scale input D
+          {{
+              // shared
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d,
+                    notIntrinsic}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d,
+                    notIntrinsic}},
+              }},
+              // tensor
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_ashift}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_ashift}},
+              }},
+          }},
+      }},
+      // with disable output lane
+      {{
+          // without scale input D
+          {{
+              // shared
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1,
+                    notIntrinsic}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2,
+                    notIntrinsic}},
+              }},
+              // tensor
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift}},
+              }},
+          }},
+          // with scale input D
+          {{
+              // shared
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1,
+                    notIntrinsic}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2,
+                    notIntrinsic}},
+              }},
+              // tensor
+              {{
+                  // cg1
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift}},
+                  // cg2
+                  {{llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2,
+                    llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift}},
+              }},
+          }},
+      }},
+  }};
+
+  llvm::Value *ScaleInputD = mt.lookupValue(thisOp.getScaleInputD());
+  bool hasScaleInputD = ScaleInputD != nullptr;
+
+  llvm::Value *DisableOutputLane =
+      mt.lookupValue(thisOp.getDisableOutputLane());
+  bool hasDisableOutputLane = DisableOutputLane != nullptr;
+
+  unsigned ctaGroup =
+      static_cast<unsigned>(getNVVMCtaGroupKind(thisOp.getCtaGroup()));
+
+  llvm::Intrinsic::ID ID =
+      tcgen05MMASparseIDs[hasDisableOutputLane][hasScaleInputD][isATensor]
+                         [ctaGroup - 1][thisOp.getAShift()];
+
+  assert(ID != notIntrinsic && "Invalid intrinsic for Tcgen05MMASparseOp.");
+
+  if (hasScaleInputD)
+    args.push_back(ScaleInputD);
+
+  if (hasDisableOutputLane)
+    args.push_back(DisableOutputLane);
+
+  args.push_back(builder.getInt32(static_cast<unsigned>(thisOp.getKind())));
+
+  if (!hasDisableOutputLane)
+    args.push_back(builder.getInt32(ctaGroup));
+
+  args.push_back(
+      builder.getInt32(static_cast<unsigned>(thisOp.getCollectorOp())));
+
+  return {ID, args};
+}
+
+LogicalResult Tcgen05MMASparseOp::verify() {
+  return verifyTcgen05MMAOp(isa<LLVM::LLVMPointerType>(getMatrixA().getType()),
+                            getDisableOutputLane(), getCtaGroup(), getAShift(),
+                            getCollectorOp(), getLoc());
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma.block_scale functions
+//===----------------------------------------------------------------------===//
+
+mlir::NVVM::IDArgPair Tcgen05MMABlockScaleOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+
+  auto thisOp = cast<Tcgen05MMABlockScaleOp>(op);
+  llvm::SmallVector<llvm::Value *> args;
+
+  args.push_back(mt.lookupValue(thisOp.getMatrixD()));
+
+  llvm::Value *A = mt.lookupValue(thisOp.getMatrixA());
+  bool isATensor = isa<llvm::PointerType>(A->getType());
+  args.push_back(A);
+
+  args.push_back(mt.lookupValue(thisOp.getMatrixB()));
+  args.push_back(mt.lookupValue(thisOp.getIdesc()));
+  args.push_back(mt.lookupValue(thisOp.getEnableInputD()));
+  args.push_back(mt.lookupValue(thisOp.getScaleA()));
+  args.push_back(mt.lookupValue(thisOp.getScaleB()));
+  args.push_back(builder.getInt32(
+      static_cast<unsigned>(getNVVMCtaGroupKind(thisOp.getCtaGroup()))));
+  args.push_back(
+      builder.getInt32(static_cast<unsigned>(thisOp.getCollectorOp())));
+
+  auto kind = thisOp.getKind();
+  auto blockScale = thisOp.getBlockScale();
+  llvm::Intrinsic::ID ID = [&]() {
+    if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF8F6F4) {
+      if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_tensor_mxf8f6f4_block_scale
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_shared_mxf8f6f4_block_scale;
+      } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_tensor_mxf8f6f4_block_scale_block32
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_shared_mxf8f6f4_block_scale_block32;
+      }
+    } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4) {
+      if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_tensor_mxf4_block_scale
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_shared_mxf4_block_scale;
+      } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_tensor_mxf4_block_scale_block32
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_shared_mxf4_block_scale_block32;
+      }
+    } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4NVF4) {
+      if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_tensor_mxf4nvf4_block_scale_block32
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_shared_mxf4nvf4_block_scale_block32;
+      } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK16) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_tensor_mxf4nvf4_block_scale_block16
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_shared_mxf4nvf4_block_scale_block16;
+      }
+    }
+    llvm_unreachable("Invalid tcgen05.mma.block_scale attributes");
+  }();
+
+  return {ID, args};
+}
+
+static LogicalResult
+verifyTcgen05MMABlockScaleOp(NVVM::Tcgen05MMACollectorOp collectorOp,
+                             NVVM::Tcgen05MMABlockScaleKind kind,
+                             NVVM::Tcgen05MMABlockScale blockScale,
+                             Location loc) {
+
+  if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT &&
+      kind == Tcgen05MMABlockScaleKind::MXF4NVF4)
+    return emitError(loc, "mxf4nvf4 requires block scale attribute");
+
+  if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK16 &&
+      kind != Tcgen05MMABlockScaleKind::MXF4NVF4)
+    return emitError(loc,
+                     llvm::formatv("{} kind does not support block16 attribute",
+                                   stringifyEnum(kind)));
+
+  return success();
+}
+
+LogicalResult Tcgen05MMABlockScaleOp::verify() {
+  return verifyTcgen05MMABlockScaleOp(getCollectorOp(), getKind(),
+                                      getBlockScale(), getLoc());
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma.sp.block_scale functions
+//===----------------------------------------------------------------------===//
+
+mlir::NVVM::IDArgPair Tcgen05MMASparseBlockScaleOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+
+  auto thisOp = cast<Tcgen05MMASparseBlockScaleOp>(op);
+  llvm::SmallVector<llvm::Value *> args;
+
+  args.push_back(mt.lookupValue(thisOp.getMatrixD()));
+
+  llvm::Value *A = mt.lookupValue(thisOp.getMatrixA());
+  bool isATensor = isa<llvm::PointerType>(A->getType());
+  args.push_back(A);
+
+  args.push_back(mt.lookupValue(thisOp.getMatrixB()));
+  args.push_back(mt.lookupValue(thisOp.getIdesc()));
+  args.push_back(mt.lookupValue(thisOp.getEnableInputD()));
+  args.push_back(mt.lookupValue(thisOp.getSparseMetadata()));
+  args.push_back(mt.lookupValue(thisOp.getScaleA()));
+  args.push_back(mt.lookupValue(thisOp.getScaleB()));
+  args.push_back(builder.getInt32(
+      static_cast<unsigned>(getNVVMCtaGroupKind(thisOp.getCtaGroup()))));
+  args.push_back(
+      builder.getInt32(static_cast<unsigned>(thisOp.getCollectorOp())));
+
+  auto kind = thisOp.getKind();
+  auto blockScale = thisOp.getBlockScale();
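+  // Select the intrinsic from the (kind, blockScale, A-operand location)
+  // triple. Combinations not covered below are rejected by the shared op
+  // verifier, so the fallthrough is unreachable.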
+  llvm::Intrinsic::ID ID = [&]() {
+    if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF8F6F4) {
+      if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_mxf8f6f4_block_scale
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_mxf8f6f4_block_scale;
+      } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_mxf8f6f4_block_scale_block32
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_mxf8f6f4_block_scale_block32;
+      }
+    } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4) {
+      if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_mxf4_block_scale
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_mxf4_block_scale;
+      } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_mxf4_block_scale_block32
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_mxf4_block_scale_block32;
+      }
+    } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4NVF4) {
+      if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_mxf4nvf4_block_scale_block32
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_mxf4nvf4_block_scale_block32;
+      } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK16) {
+        return isATensor
+                   ? llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_mxf4nvf4_block_scale_block16
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_mxf4nvf4_block_scale_block16;
+      }
+    }
+    llvm_unreachable("Invalid tcgen05.mma.sp.block_scale attributes");
+  }();
+
+  return {ID, args};
+}
+
+LogicalResult Tcgen05MMASparseBlockScaleOp::verify() {
+  return verifyTcgen05MMABlockScaleOp(getCollectorOp(), getKind(),
+                                      getBlockScale(), getLoc());
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma.ws functions
+//===----------------------------------------------------------------------===//
+
+mlir::NVVM::IDArgPair Tcgen05MMAWsOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+
+  auto thisOp = cast<Tcgen05MMAWsOp>(op);
+  llvm::SmallVector<llvm::Value *> args;
+
+  args.push_back(mt.lookupValue(thisOp.getMatrixD()));
+
+  llvm::Value *A = mt.lookupValue(thisOp.getMatrixA());
+  bool isATensor = isa<llvm::PointerType>(A->getType());
+  args.push_back(A);
+
+  args.push_back(mt.lookupValue(thisOp.getMatrixB()));
+  args.push_back(mt.lookupValue(thisOp.getIdesc()));
+  args.push_back(mt.lookupValue(thisOp.getEnableInputD()));
+
+  mlir::Value ZeroColMask = thisOp.getZeroColMask();
+  llvm::Intrinsic::ID ID = notIntrinsic;
+  if (ZeroColMask) {
+    args.push_back(mt.lookupValue(ZeroColMask));
+    ID = isATensor ? llvm::Intrinsic::nvvm_tcgen05_mma_ws_tensor_zero_col_mask
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_ws_shared_zero_col_mask;
+  } else
+    ID = isATensor ? llvm::Intrinsic::nvvm_tcgen05_mma_ws_tensor
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_ws_shared;
+
+  args.push_back(builder.getInt32(static_cast<unsigned>(thisOp.getKind())));
+  args.push_back(
+      builder.getInt32(static_cast<unsigned>(thisOp.getCollectorBBuffer())));
+  args.push_back(
+      builder.getInt32(static_cast<unsigned>(thisOp.getCollectorOp())));
+
+  return {ID, args};
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma.ws.sp functions
+//===----------------------------------------------------------------------===//
+
+mlir::NVVM::IDArgPair Tcgen05MMAWsSparseOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+
+  auto thisOp = cast<Tcgen05MMAWsSparseOp>(op);
+  llvm::SmallVector<llvm::Value *> args;
+
+  args.push_back(mt.lookupValue(thisOp.getMatrixD()));
+
+  llvm::Value *A = mt.lookupValue(thisOp.getMatrixA());
+  bool isATensor = isa<llvm::PointerType>(A->getType());
+  args.push_back(A);
+
+  args.push_back(mt.lookupValue(thisOp.getMatrixB()));
+  args.push_back(mt.lookupValue(thisOp.getIdesc()));
+  args.push_back(mt.lookupValue(thisOp.getEnableInputD()));
+  args.push_back(mt.lookupValue(thisOp.getSparseMetadata()));
+
+  mlir::Value ZeroColMask = thisOp.getZeroColMask();
+  llvm::Intrinsic::ID ID = notIntrinsic;
+  if (ZeroColMask) {
+    args.push_back(mt.lookupValue(ZeroColMask));
+    ID = isATensor
+             ? llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_tensor_zero_col_mask
+             : llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_shared_zero_col_mask;
+  } else
+    ID = isATensor ? llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_tensor
+                   : llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_shared;
+
+  args.push_back(builder.getInt32(static_cast<unsigned>(thisOp.getKind())));
+  args.push_back(
+      builder.getInt32(static_cast<unsigned>(thisOp.getCollectorBBuffer())));
+  args.push_back(
+      builder.getInt32(static_cast<unsigned>(thisOp.getCollectorOp())));
+
+  return {ID, args};
+}
+
 //===----------------------------------------------------------------------===//
 // NVVMDialect initialization, type parsing, and registration.
//===----------------------------------------------------------------------===// diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir new file mode 100644 index 0000000000000..db4574bfaf78f --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, 
ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, 
i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = 
#nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, 
ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, 
%idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir new file mode 100644 index 0000000000000..a15c3fb73de9c --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = 
#nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2
+llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_1
+llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_2
+llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1
+llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2
+llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir
new file mode 100644
index 0000000000000..f46b35a910fd9
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir
@@ -0,0 +1,119 @@
+// RUN: mlir-translate --mlir-to-llvmir -verify-diagnostics -split-input-file %s
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_1
+llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>) {
+  // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}}
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLanev8
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>)
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_2
+llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>) {
+  // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}}
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLanev8
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>)
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_shared_ashift
+llvm.func @nvvm_tcgen05_mma_shared_ashift(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) {
+  // expected-error @below {{A-shift can be applied only when matrix A is in tensor memory}}
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, i64, i64, i32, i1)
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_ashift
+llvm.func @nvvm_tcgen05_mma_ashift(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) {
+  // expected-error @below {{Cannot use collector buffer operation fill or use with ashift}}
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_default
+llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>) {
+  // expected-error @below {{mxf4nvf4 requires block scale attribute}}
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scalea, %scaleb
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_default
+llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>) {
+  // expected-error @below {{mxf4 kind does not support block16 attribute}}
+  nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scalea, %scaleb
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, ashift, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1
+llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) {
+  // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}}
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLanev8
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2
+llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) {
+  // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}}
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLanev8
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @nvvm_tcgen05_sp_mma_shared_ashift
+llvm.func @nvvm_tcgen05_sp_mma_shared_ashift(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) {
+  // expected-error @below {{A-shift can be applied only when matrix A is in tensor memory}}
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_ashift
+llvm.func @nvvm_tcgen05_mma_sp_ashift(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) {
+  // expected-error @below {{Cannot use collector buffer operation fill or use with ashift}}
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>)
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_default
+llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) {
+  // expected-error @below {{mxf4nvf4 requires block scale attribute}}
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scalea, %scaleb
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_default
+llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) {
+  // expected-error @below {{mxf4 kind does not support block16 attribute}}
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scalea, %scaleb
+    {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, ashift, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir
new file mode 100644
index 0000000000000..286df36730e77
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir
@@ -0,0 +1,442 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_cta_1
+llvm.func @nvvm_tcgen05_mma_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_cta_2
+llvm.func @nvvm_tcgen05_mma_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1)
+
+  llvm.return
+}
+
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_1
+llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_2
+llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_1
+llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_2
+llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1
+llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2
+llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr
addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 
{{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir new file mode 100644 index 0000000000000..5c7eabee71b4e --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 
{{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, 
%a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = 
#nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, 
collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, 
!llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, 
%idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir new file mode 100644 index 0000000000000..3200411aee213 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = 
#nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1
+llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_1>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_1>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_1>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_1>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2
+llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_2>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_2>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_2>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>, ctaGroup = #nvvm.cta_group<cta_2>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1
+llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_1>, blockScale = #nvvm.tcgen05_mma_block_scale<block16>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_1>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, blockScale = #nvvm.tcgen05_mma_block_scale<block16>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_1>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>, blockScale = #nvvm.tcgen05_mma_block_scale<block16>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_1>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>, blockScale = #nvvm.tcgen05_mma_block_scale<block16>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_1>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2
+llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_2>, blockScale = #nvvm.tcgen05_mma_block_scale<block16>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_2>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, blockScale = #nvvm.tcgen05_mma_block_scale<block16>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_2>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>, blockScale = #nvvm.tcgen05_mma_block_scale<block16>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_2>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>, blockScale = #nvvm.tcgen05_mma_block_scale<block16>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b
+    {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4nvf4>, ctaGroup = #nvvm.cta_group<cta_2>, blockScale = #nvvm.tcgen05_mma_block_scale<block32>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir
new file mode 100644
index 0000000000000..96044cf669d63
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir
@@ -0,0 +1,442 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_1
+llvm.func @nvvm_tcgen05_mma_sp_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_2
+llvm.func @nvvm_tcgen05_mma_sp_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+
+  llvm.return
+}
+
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1
+llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2
+llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1
+llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>, %spmetadata: !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2
+llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1
+llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>, %spmetadata: !llvm.ptr<6>) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2
+llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir
new file mode 100644
index 0000000000000..709beb0508bb8
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir
@@ -0,0 +1,634 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_1
+llvm.func @nvvm_tcgen05_mma_sp_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>,
!llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, 
%idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, 
i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2) + 
nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, 
ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, 
aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2
+llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>)
+
+  llvm.return
+}
+
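+// Note: the .cg1/.cg2 suffix on the disable_output_lane intrinsics mirrors the
+// ctaGroup attribute, and the width of the mask operand follows it: cta_1
+// takes a vector<4 x i32> mask, cta_2 a vector<8 x i32> mask.
+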
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1
+llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>, %spmetadata: !llvm.ptr<6>) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2
+llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3)
+  nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>)
+
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir
new file mode 100644
index 0000000000000..798e311778beb
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir
@@ -0,0 +1,633 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
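+// Note: in the calls below, the trailing i32 immediates encode the op's enum
+// attributes in order: kind (f16=0, tf32=1, f8f6f4=2, i8=3), cta_group (1 or 2),
+// and collector op (discard=0, lastuse=1, fill=2, use=3). Ops carrying the
+// aShift flag lower to the .ashift variants of the intrinsics.
+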
+// CHECK-LABEL: @nvvm_tcgen05_mma_cta_1
+llvm.func @nvvm_tcgen05_mma_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=discard */ i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=discard */ i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 1, /* collector=discard */ i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=discard */ i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=fill */ i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=fill */ i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 1, /* collector=fill */ i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=fill */ i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=use */ i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=use */ i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 1, /* collector=use */ i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=use */ i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_cta_2
+llvm.func @nvvm_tcgen05_mma_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* collector=discard */ i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=discard */ i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=discard */ i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* collector=discard */ i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* collector=fill */ i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=fill */ i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=fill */ i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* collector=fill */ i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* collector=use */ i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=use */ i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=use */ i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* collector=use */ i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1)
+
+  llvm.return
+}
+
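+// Note: the scale_d forms below pass the scale as an extra i64 operand (the
+// immediate 0 defined in each function, lowered to i64 0) and only exercise
+// the f16 and tf32 kinds.
+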
+// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_1
+llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_2
+llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) {
+
+  %scale_d_imm = llvm.mlir.constant(0:i64) : i64
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_2>, collectorOp = #nvvm.tcgen05_mma_collectorop<use>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64)
+
+  llvm.return
+}
+
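+// Note: ops without a collectorOp attribute lower with collector=discard
+// (i32 0), matching the discard default observed in the annotated calls above.
+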
+// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_1
+llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>) {
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f8f6f4>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<i8>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<lastuse>, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind = #nvvm.tcgen05_mma_kind<tf32>, ctaGroup = #nvvm.cta_group<cta_1>, collectorOp = #nvvm.tcgen05_mma_collectorop<fill>} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>)
+
+  // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2)
+  nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane
+    {kind
= #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 
1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: 
call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 
{{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, 
%enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, 
%disableOutputLane: vector<8 x i32>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, 
collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir new file mode 100644 index 0000000000000..5f1aeb05888bd --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws +llvm.func @nvvm_tcgen05_mma_ws(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + 
{kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, 
%idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %zero_col_mask: i64) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, 
i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir new file mode 100644 index 0000000000000..e390e350090ad --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp +llvm.func @nvvm_tcgen05_mma_ws_sp(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + 
{kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_sp_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>, %zero_col_mask: i64) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 
{{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir 
b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir new file mode 100644 index 0000000000000..f7ce5484803e9 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp +llvm.func @nvvm_tcgen05_mma_ws_sp(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_sp_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>, %zero_col_mask: i64) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // 
CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir new file mode 100644 index 0000000000000..cecbb3fbd90af --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws +llvm.func @nvvm_tcgen05_mma_ws(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %zero_col_mask: i64) { + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + 
nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + llvm.return +} From 200793ac218735e2186e9f2850f8e74a28c36a27 Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Tue, 18 Nov 2025 11:10:58 +0000 Subject: [PATCH 29/35] Extend MemoryEffects to Support Target-Specific Memory Locations (#148650) This patch introduces preliminary support for two additional memory locations, target_mem0 and target_mem1, which model memory locations that cannot be represented with the existing location kinds. This solution was suggested in: https://discourse.llvm.org/t/rfc-improving-fpmr-handling-for-fp8-intrinsics-in-llvm/86868/6 These locations are not yet target-specific. The goal is to enable the compiler to express read/write effects on these resources.
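For orientation, an editor's sketch (not part of the patch): the new location kinds compose with the textual memory(...) attribute exactly like argmem or inaccessiblemem, and they occupy the next two slots in the packed integer encoding consumed by MemoryEffects::createFromIntValue, which packs two ModRef bits per location, with TargetMem0 and TargetMem1 at location indices 4 and 5 as defined in the ModRef.h hunk below. The function name here is hypothetical:

; Hypothetical declaration: reads target-specific state 0, writes
; target-specific state 1, and accesses no other memory.
declare void @read_tm0_write_tm1() memory(target_mem0: read, target_mem1: write)

; Packed encoding, using ModRefInfo values NoModRef=0, Ref=1, Mod=2, ModRef=3:
;   TargetMem0 (location 4): Ref << 8  = 256
;   TargetMem1 (location 5): Mod << 10 = 2048
; i.e. MemoryEffects::createFromIntValue(2304), the value checked for
; llvm.aarch64.get.target.mem0.set.target.mem1 in the new TableGen test below.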
--- .../CodeGen/sanitize-metadata-nosanitize.c | 18 ++--- clang/test/CodeGenOpenCL/convergent.cl | 2 +- llvm/docs/LangRef.rst | 8 +- llvm/include/llvm/AsmParser/LLToken.h | 2 + llvm/include/llvm/IR/Intrinsics.td | 19 +++++ llvm/include/llvm/Support/ModRef.h | 7 +- llvm/lib/AsmParser/LLLexer.cpp | 2 + llvm/lib/AsmParser/LLParser.cpp | 4 + llvm/lib/IR/Attributes.cpp | 6 ++ llvm/lib/Support/ModRef.cpp | 6 ++ llvm/test/Assembler/memory-attribute.ll | 55 +++++++++++++ llvm/test/Bitcode/memory-attribute-upgrade.ll | 4 +- .../TableGen/target-mem-intrinsic-attrs.td | 78 +++++++++++++++++++ .../Transforms/FunctionAttrs/argmemonly.ll | 22 +++--- .../Transforms/FunctionAttrs/nocapture.ll | 38 ++++----- .../FunctionAttrs/read-write-scc.ll | 4 +- .../Transforms/FunctionAttrs/readattrs.ll | 2 +- .../Transforms/FunctionAttrs/writeonly.ll | 4 +- .../InferFunctionAttrs/norecurse_debug.ll | 2 +- .../cfi-nounwind-direct-call.ll | 2 +- .../Transforms/SCCP/ipscp-drop-argmemonly.ll | 10 +-- llvm/unittests/Support/ModRefTest.cpp | 2 +- .../TableGen/Basic/CodeGenIntrinsics.cpp | 31 +++++++- llvm/utils/TableGen/Basic/CodeGenIntrinsics.h | 2 + .../utils/TableGen/Basic/IntrinsicEmitter.cpp | 4 +- mlir/test/Target/LLVMIR/llvmir.mlir | 10 +-- 26 files changed, 279 insertions(+), 65 deletions(-) create mode 100644 llvm/test/TableGen/target-mem-intrinsic-attrs.td diff --git a/clang/test/CodeGen/sanitize-metadata-nosanitize.c b/clang/test/CodeGen/sanitize-metadata-nosanitize.c index f2672d7f89157..74b5c9b03754f 100644 --- a/clang/test/CodeGen/sanitize-metadata-nosanitize.c +++ b/clang/test/CodeGen/sanitize-metadata-nosanitize.c @@ -10,7 +10,7 @@ // CHECK: @llvm.global_ctors = appending global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_covered2.module_ctor, ptr @__sanitizer_metadata_covered2.module_ctor }, { i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_atomics2.module_ctor, ptr @__sanitizer_metadata_atomics2.module_ctor }] // CHECK: @llvm.global_dtors = appending global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_covered2.module_dtor, ptr @__sanitizer_metadata_covered2.module_dtor }, { i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_atomics2.module_dtor, ptr @__sanitizer_metadata_atomics2.module_dtor }] //. 
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) // CHECK-LABEL: define dso_local void @escape( // CHECK-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !pcsections [[META6:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -21,7 +21,7 @@ __attribute__((noinline, not_tail_called)) void escape(const volatile void *p) { sink = p; } -// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) +// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) // CHECK-LABEL: define dso_local i32 @normal_function( // CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !pcsections [[META8:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -38,7 +38,7 @@ int normal_function(int *x, int *y) { return *y; } -// CHECK: Function Attrs: disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) +// CHECK: Function Attrs: disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) // CHECK-LABEL: define dso_local i32 @test_disable_sanitize_instrumentation( // CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -55,7 +55,7 @@ __attribute__((disable_sanitizer_instrumentation)) int test_disable_sanitize_ins return *y; } -// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) +// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) // CHECK-LABEL: define dso_local i32 @test_no_sanitize_thread( // CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !pcsections [[META14:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -72,7 +72,7 @@ __attribute__((no_sanitize("thread"))) int test_no_sanitize_thread(int *x, int * return *y; } -// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) +// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) // CHECK-LABEL: define dso_local i32 @test_no_sanitize_all( // CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] !pcsections [[META14]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -89,10 +89,10 @@ __attribute__((no_sanitize("all"))) int test_no_sanitize_all(int *x, int *y) { return *y; } //. 
-// CHECK: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR1]] = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR2]] = { disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR3]] = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "no_sanitize_thread" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR1]] = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR2]] = { disable_sanitizer_instrumentation mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR3]] = { mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) "min-legal-vector-width"="0" "no-trapping-math"="true" "no_sanitize_thread" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } // CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } //. 
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} diff --git a/clang/test/CodeGenOpenCL/convergent.cl b/clang/test/CodeGenOpenCL/convergent.cl index 53a35a4f73119..99d9ee74e669b 100644 --- a/clang/test/CodeGenOpenCL/convergent.cl +++ b/clang/test/CodeGenOpenCL/convergent.cl @@ -133,7 +133,7 @@ kernel void assume_convergent_asm() __asm__ volatile("s_barrier"); } -// CHECK: attributes #0 = { nofree noinline norecurse nounwind " +// CHECK: attributes #0 = { nofree noinline norecurse nounwind memory(readwrite, target_mem0: none, target_mem1: none) " // CHECK: attributes #1 = { {{[^}]*}}convergent{{[^}]*}} } // CHECK: attributes #2 = { {{[^}]*}}convergent{{[^}]*}} } // CHECK: attributes #3 = { {{[^}]*}}convergent noduplicate{{[^}]*}} } diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 1a8886dd79c9c..734778f73af5f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2178,7 +2178,8 @@ For example: This attribute specifies the possible memory effects of the call-site or function. It allows specifying the possible access kinds (``none``, ``read``, ``write``, or ``readwrite``) for the possible memory location - kinds (``argmem``, ``inaccessiblemem``, ``errnomem``, as well as a default). + kinds (``argmem``, ``inaccessiblemem``, ``errnomem``, ``target_mem0``, + ``target_mem1``, as well as a default). It is best understood by example: - ``memory(none)``: Does not access any memory. @@ -2220,6 +2221,11 @@ For example: accessing inaccessible memory itself). Inaccessible memory is often used to model control dependencies of intrinsics. - ``errnomem``: This refers to accesses to the ``errno`` variable. + - ``target_mem#``: These refer to target-specific state that cannot be + accessed by any other means; ``#`` is either 0 or 1. + Note: The target_mem locations are experimental and intended for internal + testing only. They must not be used in production code. + - The default access kind (specified without a location prefix) applies to all locations that haven't been specified explicitly, including those that don't currently have a dedicated location kind (e.g., accesses to globals diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index 6de99fe182ad9..24f84cfa09e34 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -206,6 +206,8 @@ enum Kind { kw_readwrite, kw_argmem, kw_inaccessiblemem, + kw_target_mem0, + kw_target_mem1, kw_errnomem, // Legacy attributes: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 27f404a1be65c..8f3cc54747074 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -54,6 +54,25 @@ def IntrInaccessibleMemOnly : IntrinsicProperty; // by the module being compiled. This is a weaker form of IntrArgMemOnly. def IntrInaccessibleMemOrArgMemOnly : IntrinsicProperty; +// Tablegen representation of IRMemLocation. +class IntrinsicMemoryLocation; + +// TODO: Populate with all IRMemLocation enum values and update +// getValueAsIRMemLocation accordingly. +def InaccessibleMem : IntrinsicMemoryLocation; +def TargetMem0 : IntrinsicMemoryLocation; +def TargetMem1 : IntrinsicMemoryLocation; + +// The list of IRMemoryLocations that are read from. +class IntrRead<list<IntrinsicMemoryLocation> idx> : IntrinsicProperty { + list<IntrinsicMemoryLocation> MemLoc = idx; +} + +// The list of IRMemoryLocations that are written to.
+class IntrWrite<list<IntrinsicMemoryLocation> idx> : IntrinsicProperty { + list<IntrinsicMemoryLocation> MemLoc = idx; +} + // Commutative - This intrinsic is commutative: X op Y == Y op X. def Commutative : IntrinsicProperty; diff --git a/llvm/include/llvm/Support/ModRef.h b/llvm/include/llvm/Support/ModRef.h index 71f3b5bcb9c2b..34f116e478966 100644 --- a/llvm/include/llvm/Support/ModRef.h +++ b/llvm/include/llvm/Support/ModRef.h @@ -66,10 +66,15 @@ enum class IRMemLocation { ErrnoMem = 2, /// Any other memory. Other = 3, + /// Represents target-specific state. + TargetMem0 = 4, + TargetMem1 = 5, /// Helpers to iterate all locations in the MemoryEffectsBase class. First = ArgMem, - Last = Other, + FirstTarget = TargetMem0, + // TargetMem IDs must be at the end of the list. + Last = TargetMem1, }; template <typename LocationEnum> class MemoryEffectsBase { diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 7a6c19ece92ac..ebca344ae7b93 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -707,6 +707,8 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(write); KEYWORD(readwrite); KEYWORD(argmem); + KEYWORD(target_mem0); + KEYWORD(target_mem1); KEYWORD(inaccessiblemem); KEYWORD(errnomem); KEYWORD(argmemonly); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 8e3ce4990f437..921462e28a467 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -2552,6 +2552,10 @@ static std::optional<IRMemLocation> keywordToLoc(lltok::Kind Tok) { return IRMemLocation::InaccessibleMem; case lltok::kw_errnomem: return IRMemLocation::ErrnoMem; + case lltok::kw_target_mem0: + return IRMemLocation::TargetMem0; + case lltok::kw_target_mem1: + return IRMemLocation::TargetMem1; default: return std::nullopt; } diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 4ac2ebd55dcac..fe6d3e5edeb09 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -656,6 +656,12 @@ std::string Attribute::getAsString(bool InAttrGrp) const { break; case IRMemLocation::Other: llvm_unreachable("This is represented as the default access kind"); + case IRMemLocation::TargetMem0: + OS << "target_mem0: "; + break; + case IRMemLocation::TargetMem1: + OS << "target_mem1: "; + break; } OS << getModRefStr(MR); } diff --git a/llvm/lib/Support/ModRef.cpp b/llvm/lib/Support/ModRef.cpp index 2bb9bc945bd2e..1083c72902c0b 100644 --- a/llvm/lib/Support/ModRef.cpp +++ b/llvm/lib/Support/ModRef.cpp @@ -49,6 +49,12 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) { case IRMemLocation::Other: OS << "Other: "; break; + case IRMemLocation::TargetMem0: + OS << "TargetMem0: "; + break; + case IRMemLocation::TargetMem1: + OS << "TargetMem1: "; + break; } OS << ME.getModRef(Loc); }); diff --git a/llvm/test/Assembler/memory-attribute.ll b/llvm/test/Assembler/memory-attribute.ll index effd4ce7c4548..4c86f8df0e6c1 100644 --- a/llvm/test/Assembler/memory-attribute.ll +++ b/llvm/test/Assembler/memory-attribute.ll @@ -78,3 +78,58 @@ declare void @fn_argmem_read_inaccessiblemem_write() ; CHECK: @fn_argmem_read_inaccessiblemem_write_reordered() declare void @fn_argmem_read_inaccessiblemem_write_reordered() memory(inaccessiblemem: write, argmem: read) + +; CHECK: Function Attrs: memory(target_mem0: write) +; CHECK: @fn_write_mem_target0() +declare void @fn_write_mem_target0() + memory(target_mem0: write) + +; CHECK: Function Attrs: memory(target_mem0: read) +; CHECK: @fn_read_mem_target0() +declare void @fn_read_mem_target0() + memory(target_mem0: read) + +; CHECK: Function Attrs:
memory(target_mem1: write) +; CHECK: @fn_write_target_mem1() +declare void @fn_write_target_mem1() + memory(target_mem1: write) + +; CHECK: Function Attrs: memory(target_mem1: read) +; CHECK: @fn_read_target_mem1() +declare void @fn_read_target_mem1() + memory(target_mem1: read) + +; CHECK: Function Attrs: memory(target_mem0: read, target_mem1: write) +; CHECK: @fn_read_target_mem0_write_mem_target1() +declare void @fn_read_target_mem0_write_mem_target1() + memory(target_mem0: read, target_mem1: write) + +; CHECK: Function Attrs: memory(inaccessiblemem: write) +; CHECK: @fn_inaccessiblemem_write_new() +declare void @fn_inaccessiblemem_write_new() + memory(inaccessiblemem: write) + +; CHECK: Function Attrs: memory(inaccessiblemem: read, target_mem0: read, target_mem1: read) +; CHECK: @fn_inaccessiblemem_target_mem0_1read() +declare void @fn_inaccessiblemem_target_mem0_1read() + memory(inaccessiblemem: read, target_mem0: read, target_mem1: read) + +; CHECK: Function Attrs: memory(target_mem0: read) +; CHECK: @fn_inaccessiblemem_none_target_mem0_read() +declare void @fn_inaccessiblemem_none_target_mem0_read() + memory(inaccessiblemem: none, target_mem0: read) + +; CHECK: Function Attrs: memory(write, inaccessiblemem: read) +; CHECK: @fn_write_inaccessiblemem_read_target_mem0_write +declare void @fn_write_inaccessiblemem_read_target_mem0_write() + memory(write, inaccessiblemem: read, target_mem0: write) + +; CHECK: Function Attrs: memory(write, target_mem0: read) +; CHECK: @fn_write_inaccessiblemem_write_target_mem0_read() +declare void @fn_write_inaccessiblemem_write_target_mem0_read() + memory(write, inaccessiblemem: write, target_mem0: read) + +; CHECK: Function Attrs: memory(write, target_mem0: read) +; CHECK: @fn_write_target_mem0_readwrite() +declare void @fn_write_target_mem0_readwrite() + memory(write, target_mem0: read) diff --git a/llvm/test/Bitcode/memory-attribute-upgrade.ll b/llvm/test/Bitcode/memory-attribute-upgrade.ll index 915b62a88935d..334a344b96f7f 100644 --- a/llvm/test/Bitcode/memory-attribute-upgrade.ll +++ b/llvm/test/Bitcode/memory-attribute-upgrade.ll @@ -1,7 +1,7 @@ ; RUN: llvm-dis < %S/Inputs/memory-attribute-upgrade.bc | FileCheck %s -; CHECK: ; Function Attrs: memory(write, argmem: read) +; CHECK: ; Function Attrs: memory(write, argmem: read, target_mem0: none, target_mem1: none) ; CHECK-NEXT: define void @test_any_write_argmem_read(ptr %p) -; CHECK: ; Function Attrs: memory(read, argmem: readwrite, inaccessiblemem: none) +; CHECK: ; Function Attrs: memory(read, argmem: readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-NEXT: define void @test_any_read_argmem_readwrite(ptr %p) diff --git a/llvm/test/TableGen/target-mem-intrinsic-attrs.td b/llvm/test/TableGen/target-mem-intrinsic-attrs.td new file mode 100644 index 0000000000000..fc9c3321ad9e9 --- /dev/null +++ b/llvm/test/TableGen/target-mem-intrinsic-attrs.td @@ -0,0 +1,78 @@ +// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include -DTEST_INTRINSICS_SUPPRESS_DEFS %s | FileCheck %s + +include "llvm/IR/Intrinsics.td" + +def int_aarch64_get_target_mem0_mem1 : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrReadMem, IntrRead<[TargetMem0, TargetMem1]>]>; + +def int_aarch64_get_target_mem0_set_target_mem1 : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrRead<[TargetMem0]>, IntrWrite<[TargetMem1]>]>; + +def int_aarch64_get_target_mem1 : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrReadMem, IntrRead<[TargetMem1]>]>; + +def int_aarch64_get_target_mem1_set_target_mem1 : 
DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrRead<[TargetMem1]>, IntrWrite<[TargetMem1]>]>; + +def int_aarch64_set_inaccessible_mem : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrWriteMem, IntrWrite<[InaccessibleMem]>]>; + +def int_aarch64_set_target_mem0 : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrWriteMem, IntrWrite<[TargetMem0]>]>; + +// CHECK: static AttributeSet getIntrinsicFnAttributeSet(LLVMContext &C, unsigned ID) { +// CHECK-NEXT: switch (ID) { +// CHECK-NEXT: default: llvm_unreachable("Invalid attribute set number"); +// CHECK-NEXT: case 0: // llvm.aarch64.get.target.mem0.mem1 +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: Ref, TargetMem1: Ref +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(1280)), +// CHECK-NEXT: }); +// CHECK-NEXT: case 1: // llvm.aarch64.get.target.mem0.set.target.mem1 +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: Ref, TargetMem1: Mod +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(2304)), +// CHECK-NEXT: }); +// CHECK-NEXT: case 2: // llvm.aarch64.get.target.mem1 +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: NoModRef, TargetMem1: Ref +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(1024)), +// CHECK-NEXT: }); +// CHECK-NEXT: case 3: // llvm.aarch64.get.target.mem1.set.target.mem1 +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: NoModRef, TargetMem1: ModRef +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(3072)), +// CHECK-NEXT: }); +// CHECK-NEXT: case 4: // llvm.aarch64.set.inaccessible.mem +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: NoModRef, InaccessibleMem: Mod, 
ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: NoModRef, TargetMem1: NoModRef +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(8)), +// CHECK-NEXT: }); +// CHECK-NEXT: case 5: // llvm.aarch64.set.target.mem0 +// CHECK-NEXT: return AttributeSet::get(C, { +// CHECK-NEXT: Attribute::get(C, Attribute::NoUnwind), +// CHECK-NEXT: Attribute::get(C, Attribute::NoCallback), +// CHECK-NEXT: Attribute::get(C, Attribute::NoSync), +// CHECK-NEXT: Attribute::get(C, Attribute::NoFree), +// CHECK-NEXT: Attribute::get(C, Attribute::WillReturn), +// CHECK-NEXT: // ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: NoModRef, Other: NoModRef, TargetMem0: Mod, TargetMem1: NoModRef +// CHECK-NEXT: Attribute::getWithMemoryEffects(C, MemoryEffects::createFromIntValue(512)), diff --git a/llvm/test/Transforms/FunctionAttrs/argmemonly.ll b/llvm/test/Transforms/FunctionAttrs/argmemonly.ll index 42e0e94c1cee3..4ff36c0dbdc3f 100644 --- a/llvm/test/Transforms/FunctionAttrs/argmemonly.ll +++ b/llvm/test/Transforms/FunctionAttrs/argmemonly.ll @@ -56,7 +56,7 @@ entry: } define i32 @test_read_global() { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i32 @test_read_global ; FNATTRS-SAME: () #[[ATTR2:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -76,7 +76,7 @@ entry: } define i32 @test_read_loaded_ptr(ptr %ptr) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i32 @test_read_loaded_ptr ; FNATTRS-SAME: (ptr readonly captures(none) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -119,7 +119,7 @@ entry: } define void @test_write_global() { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_write_global ; FNATTRS-SAME: () #[[ATTR5:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -243,7 +243,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) @arr = global [32 x i8] zeroinitializer define void @test_memcpy_src_global(ptr %dst) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_memcpy_src_global ; FNATTRS-SAME: (ptr writeonly captures(none) initializes((0, 32)) [[DST:%.*]]) #[[ATTR11:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -263,7 +263,7 @@ entry: } define void @test_memcpy_dst_global(ptr %src) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define 
void @test_memcpy_dst_global ; FNATTRS-SAME: (ptr readonly captures(none) [[SRC:%.*]]) #[[ATTR11]] { ; FNATTRS-NEXT: entry: @@ -388,7 +388,7 @@ define void @test_inaccessibleorargmemonly_readwrite(ptr %arg) { } define void @test_recursive_argmem_read(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_recursive_argmem_read ; FNATTRS-SAME: (ptr readonly captures(none) [[P:%.*]]) #[[ATTR16:[0-9]+]] { ; FNATTRS-NEXT: [[PVAL:%.*]] = load ptr, ptr [[P]], align 8 @@ -408,7 +408,7 @@ define void @test_recursive_argmem_read(ptr %p) { } define void @test_recursive_argmem_readwrite(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_recursive_argmem_readwrite ; FNATTRS-SAME: (ptr captures(none) [[P:%.*]]) #[[ATTR17:[0-9]+]] { ; FNATTRS-NEXT: [[PVAL:%.*]] = load ptr, ptr [[P]], align 8 @@ -454,7 +454,7 @@ define void @test_recursive_argmem_read_alloca(ptr %p) { } define void @test_scc_argmem_read_1(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_scc_argmem_read_1 ; FNATTRS-SAME: (ptr readonly captures(none) [[P:%.*]]) #[[ATTR16]] { ; FNATTRS-NEXT: [[PVAL:%.*]] = load ptr, ptr [[P]], align 8 @@ -474,7 +474,7 @@ define void @test_scc_argmem_read_1(ptr %p) { } define void @test_scc_argmem_read_2(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test_scc_argmem_read_2 ; FNATTRS-SAME: (ptr readonly captures(none) [[P:%.*]]) #[[ATTR16]] { ; FNATTRS-NEXT: call void @test_scc_argmem_read_1(ptr [[P]]) @@ -518,7 +518,7 @@ entry: ; FIXME: This could be `memory(argmem: read)`. define i64 @select_different_obj(i1 %c, ptr %p, ptr %p2) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i64 @select_different_obj ; FNATTRS-SAME: (i1 [[C:%.*]], ptr readonly captures(none) [[P:%.*]], ptr readonly captures(none) [[P2:%.*]]) #[[ATTR3]] { ; FNATTRS-NEXT: entry: @@ -580,7 +580,7 @@ join: ; FIXME: This could be `memory(argmem: read)`. 
define i64 @phi_different_obj(i1 %c, ptr %p, ptr %p2) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i64 @phi_different_obj ; FNATTRS-SAME: (i1 [[C:%.*]], ptr readonly captures(none) [[P:%.*]], ptr readonly captures(none) [[P2:%.*]]) #[[ATTR3]] { ; FNATTRS-NEXT: entry: diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll index 8113ba65fe422..b5b14f571d47d 100644 --- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll +++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll @@ -20,7 +20,7 @@ define ptr @c1(ptr %q) { ; It would also be acceptable to mark %q as readnone. Update @c3 too. define void @c2(ptr %q) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @c2 ; FNATTRS-SAME: (ptr [[Q:%.*]]) #[[ATTR1:[0-9]+]] { ; FNATTRS-NEXT: store ptr [[Q]], ptr @g, align 8 @@ -37,7 +37,7 @@ define void @c2(ptr %q) { } define void @c3(ptr %q) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @c3 ; FNATTRS-SAME: (ptr [[Q:%.*]]) #[[ATTR2:[0-9]+]] { ; FNATTRS-NEXT: call void @c2(ptr [[Q]]) @@ -127,7 +127,7 @@ l1: @lookup_table = global [2 x i1] [ i1 0, i1 1 ] define i1 @c5(ptr %q, i32 %bitno) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i1 @c5 ; FNATTRS-SAME: (ptr [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR3:[0-9]+]] { ; FNATTRS-NEXT: [[TMP:%.*]] = ptrtoint ptr [[Q]] to i32 @@ -222,7 +222,7 @@ define ptr @lookup_bit(ptr %q, i32 %bitno) readnone nounwind { } define i1 @c7(ptr %q, i32 %bitno) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i1 @c7 ; FNATTRS-SAME: (ptr readonly [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR6:[0-9]+]] { ; FNATTRS-NEXT: [[PTR:%.*]] = call ptr @lookup_bit(ptr [[Q]], i32 [[BITNO]]) @@ -243,7 +243,7 @@ define i1 @c7(ptr %q, i32 %bitno) { define i32 @nc1(ptr %q, ptr %p, i1 %b) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i32 @nc1 ; FNATTRS-SAME: (ptr [[Q:%.*]], ptr captures(none) [[P:%.*]], i1 [[B:%.*]]) #[[ATTR7:[0-9]+]] { ; FNATTRS-NEXT: e: @@ 
-284,7 +284,7 @@ l: } define i32 @nc1_addrspace(ptr %q, ptr addrspace(1) %p, i1 %b) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define i32 @nc1_addrspace ; FNATTRS-SAME: (ptr [[Q:%.*]], ptr addrspace(1) captures(none) [[P:%.*]], i1 [[B:%.*]]) #[[ATTR7]] { ; FNATTRS-NEXT: e: @@ -328,7 +328,7 @@ l: } define void @nc2(ptr %p, ptr %q) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @nc2 ; FNATTRS-SAME: (ptr captures(none) [[P:%.*]], ptr [[Q:%.*]]) #[[ATTR7]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = call i32 @nc1(ptr [[Q]], ptr [[P]], i1 false) @@ -468,7 +468,7 @@ define void @self_readonly_nounwind_willreturn(ptr %p) readonly nounwind willret ; It would be acceptable to add readnone to %y1_1 and %y1_2. define void @test1_1(ptr %x1_1, ptr %y1_1, i1 %c) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test1_1 ; FNATTRS-SAME: (ptr readnone captures(none) [[X1_1:%.*]], ptr [[Y1_1:%.*]], i1 [[C:%.*]]) #[[ATTR12:[0-9]+]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = call ptr @test1_2(ptr [[X1_1]], ptr [[Y1_1]], i1 [[C]]) @@ -488,7 +488,7 @@ define void @test1_1(ptr %x1_1, ptr %y1_1, i1 %c) { } define ptr @test1_2(ptr %x1_2, ptr %y1_2, i1 %c) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define ptr @test1_2 ; FNATTRS-SAME: (ptr readnone captures(none) [[X1_2:%.*]], ptr returned [[Y1_2:%.*]], i1 [[C:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] @@ -520,7 +520,7 @@ f: } define void @test2(ptr %x2) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test2 ; FNATTRS-SAME: (ptr readnone captures(none) [[X2:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: call void @test2(ptr [[X2]]) @@ -540,7 +540,7 @@ define void @test2(ptr %x2) { } define void @test3(ptr %x3, ptr %y3, ptr %z3) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test3 ; FNATTRS-SAME: (ptr readnone captures(none) [[X3:%.*]], ptr readnone captures(none) [[Y3:%.*]], ptr readnone captures(none) [[Z3:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: call void @test3(ptr [[Z3]], ptr [[Y3]], ptr [[X3]]) @@ -560,7 +560,7 @@ define void @test3(ptr %x3, ptr %y3, ptr %z3) { } define void @test4_1(ptr %x4_1, i1 %c) { -; FNATTRS: Function Attrs: nofree nosync 
nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @test4_1 ; FNATTRS-SAME: (ptr [[X4_1:%.*]], i1 [[C:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = call ptr @test4_2(ptr [[X4_1]], ptr [[X4_1]], ptr [[X4_1]], i1 [[C]]) @@ -580,7 +580,7 @@ define void @test4_1(ptr %x4_1, i1 %c) { } define ptr @test4_2(ptr %x4_2, ptr %y4_2, ptr %z4_2, i1 %c) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define ptr @test4_2 ; FNATTRS-SAME: (ptr readnone captures(none) [[X4_2:%.*]], ptr readnone returned captures(ret: address, provenance) [[Y4_2:%.*]], ptr readnone captures(none) [[Z4_2:%.*]], i1 [[C:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] @@ -744,7 +744,7 @@ entry: @g2 = global ptr null define void @captureLaunder(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: readwrite) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: readwrite, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @captureLaunder ; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR16:[0-9]+]] { ; FNATTRS-NEXT: [[B:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[P]]) @@ -788,7 +788,7 @@ entry: @g3 = global ptr null define void @captureStrip(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @captureStrip ; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR1]] { ; FNATTRS-NEXT: [[B:%.*]] = call ptr @llvm.strip.invariant.group.p0(ptr [[P]]) @@ -1086,7 +1086,7 @@ define i64 @captures_not_ret_only(ptr %p) { ;; Unlike ptrtoint, ptrtoaddr only captures the address define i64 @captures_ptrtoaddr_stored(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define noundef i64 @captures_ptrtoaddr_stored ; FNATTRS-SAME: (ptr captures(address) [[P:%.*]]) #[[ATTR1]] { ; FNATTRS-NEXT: [[INT:%.*]] = ptrtoaddr ptr [[P]] to i64 @@ -1189,7 +1189,7 @@ define ptr @captures_used_ret(ptr %p) { ; Make sure this is does not produce captures(ret: ...). We need to take the ; return capture components into account when handling argument SCCs. 
define ptr @scc_capture_via_ret(i1 %c, ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define ptr @scc_capture_via_ret ; FNATTRS-SAME: (i1 [[C:%.*]], ptr [[P:%.*]]) #[[ATTR12]] { ; FNATTRS-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] @@ -1291,7 +1291,7 @@ define void @dont_increase_existing_captures_scc2(ptr %p) { } define void @addr_only_scc(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: read, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @addr_only_scc ; FNATTRS-SAME: (ptr readonly captures(address_is_null) [[P:%.*]]) #[[ATTR20:[0-9]+]] { ; FNATTRS-NEXT: [[V:%.*]] = load i8, ptr [[P]], align 1 @@ -1314,7 +1314,7 @@ define void @addr_only_scc(ptr %p) { } define void @addr_only_scc2(ptr %p) { -; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: read, inaccessiblemem: none) +; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define void @addr_only_scc2 ; FNATTRS-SAME: (ptr readonly captures(address_is_null) [[P:%.*]]) #[[ATTR20]] { ; FNATTRS-NEXT: [[CMP:%.*]] = icmp ne ptr [[P]], null diff --git a/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll b/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll index be61990fd6278..1fc0084203fca 100644 --- a/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll +++ b/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll @@ -4,7 +4,7 @@ @i = global i32 0 define void @foo() { -; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) +; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-LABEL: define {{[^@]+}}@foo ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: store i32 1, ptr @i, align 4 @@ -17,7 +17,7 @@ define void @foo() { } define void @bar() { -; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) +; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-LABEL: define {{[^@]+}}@bar ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = load i32, ptr @i, align 4 diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll index 87f64ed3c63bc..8fc72a1ab90b9 100644 --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -33,7 +33,7 @@ define void @test1_2(ptr %x1_2, ptr %y1_2, ptr %z1_2) { ; TODO: Missing with attributor-light: argmem: none, inaccessiblemem: none define ptr @test2(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define {{[^@]+}}@test2 ; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0:[0-9]+]] { 
; FNATTRS-NEXT: store i32 0, ptr @x, align 4 diff --git a/llvm/test/Transforms/FunctionAttrs/writeonly.ll b/llvm/test/Transforms/FunctionAttrs/writeonly.ll index 88c6031613697..05ecb12c710ee 100644 --- a/llvm/test/Transforms/FunctionAttrs/writeonly.ll +++ b/llvm/test/Transforms/FunctionAttrs/writeonly.ll @@ -44,7 +44,7 @@ nouses-argworn-funro_entry: @d-ccc = internal global %_type_of_d-ccc <{ ptr null, i8 1, i8 13, i8 0, i8 -127 }>, align 8 define void @nouses-argworn-funwo(ptr writeonly %.aaa) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define {{[^@]+}}@nouses-argworn-funwo ; FNATTRS-SAME: (ptr readnone captures(none) [[DOTAAA:%.*]]) #[[ATTR2:[0-9]+]] { ; FNATTRS-NEXT: nouses-argworn-funwo_entry: @@ -82,7 +82,7 @@ define void @test_store(ptr %p) { @G = external global ptr define i8 @test_store_capture(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: read, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: read, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; FNATTRS-LABEL: define {{[^@]+}}@test_store_capture ; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR4:[0-9]+]] { ; FNATTRS-NEXT: store ptr [[P]], ptr @G, align 8 diff --git a/llvm/test/Transforms/InferFunctionAttrs/norecurse_debug.ll b/llvm/test/Transforms/InferFunctionAttrs/norecurse_debug.ll index c8568272d320f..89a09406e5f1d 100644 --- a/llvm/test/Transforms/InferFunctionAttrs/norecurse_debug.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/norecurse_debug.ll @@ -52,5 +52,5 @@ attributes #1 = { nounwind readnone speculatable } !28 = !DILocation(line: 9, column: 18, scope: !2) !29 = !DILocation(line: 10, column: 1, scope: !2) -; CHECK: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: write, inaccessiblemem: none) } +; CHECK: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: write, inaccessiblemem: none, target_mem0: none, target_mem1: none) } ; CHECK-NOT: foo.coefficient1 diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll b/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll index 2795333effd76..89c32fab54a4c 100644 --- a/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll +++ b/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll @@ -117,7 +117,7 @@ attributes #6 = { noreturn nounwind } ; CHECK-NEXT: ret i32 [[DOT]] ; ; -; CHECK: Function Attrs: minsize mustprogress nofree norecurse nosync nounwind optsize willreturn memory(write, argmem: none, inaccessiblemem: none) +; CHECK: Function Attrs: minsize mustprogress nofree norecurse nosync nounwind optsize willreturn memory(write, argmem: none, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-LABEL: define dso_local noundef range(i32 0, 2) i32 @_Z10call_catchi ; CHECK-SAME: (i32 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !type [[META4]] !type [[META5]] !type [[META6]] { ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll b/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll index 22726e0cac1f1..6a64dc3cddd39 100644 --- 
a/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll +++ b/llvm/test/Transforms/SCCP/ipscp-drop-argmemonly.ll @@ -14,7 +14,7 @@ ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0 ;. define internal void @ptrarg.1(ptr %arg, i32 %val) argmemonly nounwind { -; CHECK: Function Attrs: nounwind memory(readwrite, inaccessiblemem: none) +; CHECK: Function Attrs: nounwind memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-LABEL: @ptrarg.1( ; CHECK-NEXT: store i32 10, ptr @g, align 4 ; CHECK-NEXT: ret void @@ -62,7 +62,7 @@ define void @caller.2(ptr %ptr) { ; Here the pointer argument %arg will be replaced by a constant. We need to ; drop inaccessiblemem_or_argmemonly. define internal void @ptrarg.3(ptr %arg, i32 %val) inaccessiblemem_or_argmemonly nounwind { -; CHECK: Function Attrs: nounwind memory(readwrite) +; CHECK: Function Attrs: nounwind memory(readwrite, target_mem0: none, target_mem1: none) ; CHECK-LABEL: @ptrarg.3( ; CHECK-NEXT: store i32 10, ptr @g, align 4 ; CHECK-NEXT: ret void @@ -110,7 +110,7 @@ define void @caller.4(ptr %ptr) { ; Here the pointer argument %arg will be replaced by a constant. We need to ; drop inaccessiblemem_or_argmemonly. define internal void @ptrarg.5(ptr %arg, i32 %val) argmemonly inaccessiblemem_or_argmemonly nounwind { -; CHECK: Function Attrs: nounwind memory(readwrite, inaccessiblemem: none) +; CHECK: Function Attrs: nounwind memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) ; CHECK-LABEL: @ptrarg.5( ; CHECK-NEXT: store i32 10, ptr @g, align 4 ; CHECK-NEXT: ret void @@ -163,9 +163,9 @@ define i32 @caller.6.cs.attributes(i32 %n) { } ;. -; CHECK: attributes #[[ATTR0]] = { nounwind memory(readwrite, inaccessiblemem: none) } +; CHECK: attributes #[[ATTR0]] = { nounwind memory(readwrite, inaccessiblemem: none, target_mem0: none, target_mem1: none) } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind memory(argmem: readwrite) } -; CHECK: attributes #[[ATTR2]] = { nounwind memory(readwrite) } +; CHECK: attributes #[[ATTR2]] = { nounwind memory(readwrite, target_mem0: none, target_mem1: none) } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) } ; CHECK: attributes #[[ATTR4]] = { nounwind } ;. 
diff --git a/llvm/unittests/Support/ModRefTest.cpp b/llvm/unittests/Support/ModRefTest.cpp index 9c13908da44bb..128501bf2d957 100644 --- a/llvm/unittests/Support/ModRefTest.cpp +++ b/llvm/unittests/Support/ModRefTest.cpp @@ -21,7 +21,7 @@ TEST(ModRefTest, PrintMemoryEffects) { raw_string_ostream OS(S); OS << MemoryEffects::none(); EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: " - "NoModRef, Other: NoModRef"); + "NoModRef, Other: NoModRef, TargetMem0: NoModRef, TargetMem1: NoModRef"); } } // namespace diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index 228969ab37f85..d90fcc25502e2 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -13,6 +13,7 @@ #include "CodeGenIntrinsics.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TableGen/Error.h" @@ -377,7 +378,19 @@ void CodeGenIntrinsic::setProperty(const Record *R) { ME &= MemoryEffects::argMemOnly(); else if (R->getName() == "IntrInaccessibleMemOnly") ME &= MemoryEffects::inaccessibleMemOnly(); - else if (R->getName() == "IntrInaccessibleMemOrArgMemOnly") + else if (R->isSubClassOf("IntrRead")) { + MemoryEffects ReadMask = MemoryEffects::writeOnly(); + for (const Record *RLoc : R->getValueAsListOfDefs("MemLoc")) + ReadMask = ReadMask.getWithModRef(getValueAsIRMemLocation(RLoc), + ModRefInfo::ModRef); + ME &= ReadMask; + } else if (R->isSubClassOf("IntrWrite")) { + MemoryEffects WriteMask = MemoryEffects::readOnly(); + for (const Record *WLoc : R->getValueAsListOfDefs("MemLoc")) + WriteMask = WriteMask.getWithModRef(getValueAsIRMemLocation(WLoc), + ModRefInfo::ModRef); + ME &= WriteMask; + } else if (R->getName() == "IntrInaccessibleMemOrArgMemOnly") ME &= MemoryEffects::inaccessibleOrArgMemOnly(); else if (R->getName() == "Commutative") isCommutative = true; @@ -477,6 +490,22 @@ void CodeGenIntrinsic::setProperty(const Record *R) { } } +llvm::IRMemLocation +CodeGenIntrinsic::getValueAsIRMemLocation(const Record *R) const { + StringRef Name = R->getName(); + IRMemLocation Loc = + StringSwitch<IRMemLocation>(Name) + .Case("TargetMem0", IRMemLocation::TargetMem0) + .Case("TargetMem1", IRMemLocation::TargetMem1) + .Case("InaccessibleMem", IRMemLocation::InaccessibleMem) + .Default(IRMemLocation::Other); // placeholder; rejected below + + if (Loc == IRMemLocation::Other) + PrintFatalError(R->getLoc(), "unknown IRMemLocation: " + Name); + + return Loc; +} + bool CodeGenIntrinsic::isParamAPointer(unsigned ParamIdx) const { if (ParamIdx >= IS.ParamTys.size()) return false; diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 6ac6f734326d8..305260a7ef4a9 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -186,6 +186,8 @@ struct CodeGenIntrinsic { bool isParamImmArg(unsigned ParamIdx) const; + llvm::IRMemLocation getValueAsIRMemLocation(const Record *R) const; + CodeGenIntrinsic(const Record *R, const CodeGenIntrinsicContext &Ctx); }; diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 3ac23185ef91c..9fed5920a019f 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -599,10 +599,10 @@ static AttributeSet getIntrinsicFnAttributeSet(LLVMContext &C,
unsigned ID) { if (!UniqFnAttributes.try_emplace(&Int, ID).second) continue; OS << formatv(R"( - case {}: + case {}: // {} return AttributeSet::get(C, {{ )", - ID); + ID, Int.Name); auto addAttribute = [&OS](StringRef Attr) { OS << formatv(" Attribute::get(C, Attribute::{}),\n", Attr); }; diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index cc243c86ca902..0e087200b1116 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -2373,7 +2373,7 @@ llvm.func @readonly_function(%arg0: !llvm.ptr {llvm.readonly}) llvm.func @arg_mem_none_func() attributes { memory_effects = #llvm.memory_effects} -// CHECK: attributes #[[ATTR]] = { memory(readwrite, argmem: none, errnomem: none) } +// CHECK: attributes #[[ATTR]] = { memory(readwrite, argmem: none, errnomem: none, target_mem0: none, target_mem1: none) } // ----- @@ -2381,7 +2381,7 @@ llvm.func @arg_mem_none_func() attributes { llvm.func @readwrite_func() attributes { memory_effects = #llvm.memory_effects} -// CHECK: attributes #[[ATTR]] = { memory(readwrite, errnomem: none) } +// CHECK: attributes #[[ATTR]] = { memory(readwrite, errnomem: none, target_mem0: none, target_mem1: none) } // ----- @@ -2734,11 +2734,11 @@ llvm.func @mem_effects_call() { // CHECK: #[[ATTRS_0]] // CHECK-SAME: memory(none) // CHECK: #[[ATTRS_1]] -// CHECK-SAME: memory(read, argmem: none, inaccessiblemem: write, errnomem: none) +// CHECK-SAME: memory(read, argmem: none, inaccessiblemem: write, errnomem: none, target_mem0: none, target_mem1: none) // CHECK: #[[ATTRS_2]] -// CHECK-SAME: memory(read, inaccessiblemem: write, errnomem: none) +// CHECK-SAME: memory(read, inaccessiblemem: write, errnomem: none, target_mem0: none, target_mem1: none) // CHECK: #[[ATTRS_3]] -// CHECK-SAME: memory(readwrite, argmem: read, errnomem: none) +// CHECK-SAME: memory(readwrite, argmem: read, errnomem: none, target_mem0: none, target_mem1: none) // ----- From 5efce7392f3f6cc41f603a8d84070f05b277010a Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Tue, 18 Nov 2025 11:21:23 +0000 Subject: [PATCH 30/35] [compiler-rt][ARM] Optimized mulsf3 and divsf3 (#168394) (Reland of #161546, fixing three build and test issues) This commit adds optimized assembly versions of single-precision float multiplication and division. Both functions are implemented in a style that can be assembled as either of Arm and Thumb2; for multiplication, a separate implementation is provided for Thumb1. Also, extensive new tests are added for multiplication and division. These implementations can be removed from the build by defining the cmake variable COMPILER_RT_ARM_OPTIMIZED_FP=OFF. Outlying parts of the functionality which are not on the fast path, such as NaN handling and underflow, are handled in helper functions written in C. These can be shared between the Arm/Thumb2 and Thumb1 implementations, and also reused by other optimized assembly functions we hope to add in future. 
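As a quick sanity check of what these entry points compute, a minimal harness along the following lines can be linked against the resulting library. This is a sketch, not part of the patch; it assumes an AEABI target where __aeabi_fmul and __aeabi_fdiv are the soft-float multiply and divide entry points (as this patch provides), and the helper name bits_of is hypothetical:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

extern float __aeabi_fmul(float a, float b); // AEABI single-precision multiply
extern float __aeabi_fdiv(float n, float d); // AEABI single-precision divide

static uint32_t bits_of(float f) {
  uint32_t u;
  memcpy(&u, &f, sizeof u); // well-defined way to read the IEEE-754 encoding
  return u;
}

int main(void) {
  float m = __aeabi_fmul(3.5f, -2.0f); // expect -7.0f, encoded as 0xC0E00000
  float d = __aeabi_fdiv(1.0f, 3.0f);  // expect 0x3EAAAAAB (round to nearest even)
  printf("mul: %08x div: %08x\n", (unsigned)bits_of(m), (unsigned)bits_of(d));
  return 0;
}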
--- .../cmake/Modules/CheckAssemblerFlag.cmake | 39 ++ compiler-rt/lib/builtins/CMakeLists.txt | 47 ++ compiler-rt/lib/builtins/arm/divsf3.S | 618 ++++++++++++++++++ compiler-rt/lib/builtins/arm/fnan2.c | 42 ++ compiler-rt/lib/builtins/arm/fnorm2.c | 62 ++ compiler-rt/lib/builtins/arm/funder.c | 78 +++ compiler-rt/lib/builtins/arm/mulsf3.S | 319 +++++++++ compiler-rt/lib/builtins/arm/thumb1/mulsf3.S | 251 +++++++ compiler-rt/test/builtins/CMakeLists.txt | 4 + compiler-rt/test/builtins/Unit/divsf3_test.c | 503 +++++++++++--- compiler-rt/test/builtins/Unit/mulsf3_test.c | 616 +++++++++++++++++ 11 files changed, 2484 insertions(+), 95 deletions(-) create mode 100644 compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake create mode 100644 compiler-rt/lib/builtins/arm/divsf3.S create mode 100644 compiler-rt/lib/builtins/arm/fnan2.c create mode 100644 compiler-rt/lib/builtins/arm/fnorm2.c create mode 100644 compiler-rt/lib/builtins/arm/funder.c create mode 100644 compiler-rt/lib/builtins/arm/mulsf3.S create mode 100644 compiler-rt/lib/builtins/arm/thumb1/mulsf3.S create mode 100644 compiler-rt/test/builtins/Unit/mulsf3_test.c diff --git a/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake b/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake new file mode 100644 index 0000000000000..ace80ce5583c7 --- /dev/null +++ b/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake @@ -0,0 +1,39 @@ +# Helper function to find out whether the assembler supports a particular +# command-line flag. You'd like to use the standard check_compiler_flag(), but +# that only supports a fixed list of languages, and ASM isn't one of them. So +# we do it ourselves, by trying to assemble an empty source file. + +function(check_assembler_flag outvar flag) + if(NOT DEFINED "${outvar}") + if(NOT CMAKE_REQUIRED_QUIET) + message(CHECK_START "Checking for assembler flag ${flag}") + endif() + + # Stop try_compile from attempting to link the result of the assembly, so + # that we don't depend on having a working linker, and also don't have to + # figure out what special symbol like _start needs to be defined in the + # test input. + # + # This change is made within the dynamic scope of this function, so + # CMAKE_TRY_COMPILE_TARGET_TYPE will be restored to its previous value on + # return. + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) + + # Try to assemble an empty file with a .S name, using the provided flag. + set(asm_source_file + ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CheckAssemblerFlag.S) + write_file(${asm_source_file} "") + try_compile(${outvar} + ${CMAKE_BINARY_DIR} + SOURCES ${asm_source_file} + COMPILE_DEFINITIONS ${flag}) + + if(NOT CMAKE_REQUIRED_QUIET) + if(${outvar}) + message(CHECK_PASS "Accepted") + else() + message(CHECK_FAIL "Not accepted") + endif() + endif() + endif() +endfunction() diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 02e6ecfbdb60e..cfe1d818cf7c6 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -60,6 +60,7 @@ endif() include(builtin-config-ix) include(CMakeDependentOption) include(CMakePushCheckState) +include(CheckAssemblerFlag) option(COMPILER_RT_BUILTINS_HIDE_SYMBOLS "Do not export any symbols from the static library." ON) @@ -423,6 +424,40 @@ set(arm_or_thumb2_base_SOURCES ${GENERIC_SOURCES} ) +option(COMPILER_RT_ARM_OPTIMIZED_FP + "On 32-bit Arm, use optimized assembly implementations of FP arithmetic. Likely to increase code size, but be faster." 
ON) + +set(arm_or_thumb2_optimized_fp_SOURCES) +if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm") + check_assembler_flag(COMPILER_RT_HAS_MIMPLICIT_IT -mimplicit-it=always) + if(COMPILER_RT_HAS_MIMPLICIT_IT) + set(implicit_it_flag -mimplicit-it=always) + else() + check_assembler_flag( + COMPILER_RT_HAS_WA_MIMPLICIT_IT -Wa,-mimplicit-it=always) + if(COMPILER_RT_HAS_WA_MIMPLICIT_IT) + set(implicit_it_flag -Wa,-mimplicit-it=always) + else() + message(WARNING "Don't know how to set the -mimplicit-it=always flag in this assembler; not including Arm optimized implementations") + set(implicit_it_flag "") + endif() + endif() + + if(implicit_it_flag) + set(assembly_files + arm/mulsf3.S + arm/divsf3.S) + set_source_files_properties(${assembly_files} + PROPERTIES COMPILE_OPTIONS ${implicit_it_flag}) + set(arm_or_thumb2_optimized_fp_SOURCES + ${assembly_files} + arm/fnan2.c + arm/fnorm2.c + arm/funder.c + ) + endif() +endif() + set(arm_sync_SOURCES arm/sync_fetch_and_add_4.S arm/sync_fetch_and_add_8.S @@ -456,6 +491,16 @@ set(thumb1_base_SOURCES ${GENERIC_SOURCES} ) +if(COMPILER_RT_ARM_OPTIMIZED_FP) + set(thumb1_base_SOURCES + arm/thumb1/mulsf3.S + arm/fnan2.c + arm/fnorm2.c + arm/funder.c + ${thumb1_base_SOURCES} + ) +endif() + set(arm_EABI_RT_SOURCES arm/aeabi_cdcmp.S arm/aeabi_cdcmpeq_check_nan.c @@ -567,6 +612,7 @@ if(MINGW) arm/aeabi_uldivmod.S arm/chkstk.S ${arm_or_thumb2_base_SOURCES} + ${arm_or_thumb2_optimized_fp_SOURCES} ${arm_sync_SOURCES} ) @@ -577,6 +623,7 @@ elseif(NOT WIN32) # TODO the EABI sources should only be added to EABI targets set(arm_SOURCES ${arm_or_thumb2_base_SOURCES} + ${arm_or_thumb2_optimized_fp_SOURCES} ${arm_sync_SOURCES} ${arm_EABI_SOURCES} ${arm_Thumb1_SOURCES} diff --git a/compiler-rt/lib/builtins/arm/divsf3.S b/compiler-rt/lib/builtins/arm/divsf3.S new file mode 100644 index 0000000000000..faabd8225c344 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/divsf3.S @@ -0,0 +1,618 @@ +//===-- divsf3.S - single-precision floating point division ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float division with the IEEE-754 +// default rounding (to nearest, ties to even), in optimized AArch32 assembly +// language suitable to be built as either Arm or Thumb2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + + .syntax unified + .text + .p2align 2 + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__divsf3) + push {r4, lr} + vmov r0, s0 + vmov r1, s1 + bl __aeabi_fdiv + vmov s0, r0 + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__divsf3, __aeabi_fdiv) +#endif + +DEFINE_COMPILERRT_FUNCTION(__aeabi_fdiv) + // Extract the exponents of the inputs into r2 and r3, occupying bits 16-23 + // of each register so that there will be space lower down to store extra + // data without exponent arithmetic carrying into it. In the process, check + // both exponents for 00 or FF and branch out of line to handle all the + // uncommon types of value (infinity, NaN, zero, denormals). + // + // Chaining conditional instructions like this means that the second + // instruction (setting up r3) might not be executed at all, so fdiv_uncommon + // will have to redo it just in case. 
That saves an instruction here, + // executed for _all_ inputs, and moves it to the uncommon path run for only + // some inputs. + mov r12, #0xFF0000 + ands r2, r12, r0, lsr #7 // r2 has exponent of numerator. (Is it 0?) + andsne r3, r12, r1, lsr #7 // r3 has exponent of denominator. (Is it 0?) + teqne r2, r12 // if neither was 0, is one FF? + teqne r3, r12 // or the other? + beq LOCAL_LABEL(uncommon) // branch out of line if any answer was yes + + // Calculate the output sign, which is always just the XOR of the input + // signs. Store it in bit 8 of r2, below the numerator exponent. + teq r0, r1 // is the output sign bit 1? + orrmi r2, r2, #0x100 // if so, set bit 8 of r2 + + // Isolate the mantissas of both values, by setting bit 23 of each one and + // clearing the 8 bits above that. + // + // In the process, swap the register allocations (which doesn't cost extra + // instructions if we do it as part of this manipulation). We want the + // numerator not to be in r0, because r0 is where we'll build up the quotient + // while subtracting things from the numerator. + orr r12, r0, #1 << 23 + orr r0, r1, #1 << 23 + bic r1, r12, #0xFF000000 + bic r0, r0, #0xFF000000 + +LOCAL_LABEL(div): + // Start of the main division. We get here knowing that: + // + // r0 = mantissa of denominator, with the leading 1 at bit 23 + // r1 = mantissa of numerator, similarly + // r2 = (exponent of numerator << 16) + (result sign << 8) + // r3 = (exponent of denominator << 16) + + push {r14} // we'll need an extra register + + // Calculate the initial result exponent by just subtracting the two input + // exponents. This doesn't affect the sign bit lower down in r2. + sub r2, r2, r3 + + // That initial exponent might need to be adjusted by 1, depending on whether + // dividing the mantissas gives a value >=1 or <1. We don't need to wait + // until the division is finished to work that out: we can tell immediately + // by just comparing the mantissas. + // + // The basic idea is to do the comparison in a way that sets the C flag if + // numerator >= denominator. Then we recombine the sign and exponent by doing + // "ADC r2, r2, r2, asr #16": the exponent in the top half of r2 is shifted + // down to the low 8 bits, just below the sign bit, and using ADC rather than + // ADD folds in the conditional increment from the mantissa comparison. + // + // If we're not incrementing the output exponent, we instead shift the + // numerator mantissa left by 1, so that it _is_ greater than the denominator + // mantissa. Otherwise we'd generate only a 22-bit quotient, instead of 23. + // + // The exponent also needs to be rebiased, so that dividing two numbers the + // same gives an output exponent of 0x7F. If the two inputs have the same + // exponent then we'll have computed an exponent of 0 via the SUB instruction + // above; if the mantissas are the same as well then the ADC will increment + // it; also, the leading bit of the quotient will increment the exponent + // again when we recombine it with the output mantissa later. So we need to + // add (0x7F - 2) to the exponent now, to make an exponent of 0 from the SUB + // come to 0x7F after both of those increments.
+ // + // Putting all of that together, what we _want_ to do is this: + // + // [#1] CMP r1, r0 // set C if num >= den + // [#2] MOVLO r1, r1, lsl #1 // if num < den, shift num left + // [#3] ADD r2, r2, #0x7D0000 // rebias exponent + // [#4] ADC r2, r2, r2, asr #16 // combine sign + exp + adjustment + // + // However, we only do the first of those four instructions right here. The + // other three are distributed through the code below, after unrelated load + // or multiply instructions which will have a result delay slot on simple + // CPUs. Each is labelled "exponent setup [#n]" in a comment. + // + // (Since instruction #4 depends on the flags set up by #2, we must avoid + // clobbering the flags in _any_ of the instructions interleaved with this!) + cmp r1, r0 // exponent setup [#1] + + // Start the mantissa division by making an approximation to the reciprocal + // of the denominator. We first obtain an 8-bit approximation using a table + // lookup indexed by the top 7 denominator bits (counting the leading 1, so + // really there are only 6 bits in the table index). + // + // (r0 >> 17) is the table index, and its top bit is always set, so it ranges + // from 64 to 127 inclusive. So we point the base register 64 bytes before + // the actual table. + adr r12, LOCAL_LABEL(tab) - 64 +#if __thumb__ + // Thumb can't do this particular shift+add+load in one instruction - it only + // supports left shifts of 0 to 3 bits, not right shifts of 17. So we must + // calculate the load offset separately. + add r14, r12, r0, lsr #17 + ldrb r14, [r14] +#else + ldrb r14, [r12, r0, lsr #17] +#endif + + // Now do an iteration of Newton-Raphson to improve that 8-bit approximation + // to have 15-16 accurate bits. + // + // Basics of Newton-Raphson for finding a reciprocal: if you want to find 1/d + // and you have some approximation x, your next approximation is X = x(2-dx). + // Looked at one way, this is the result of applying the N-R formula + // X=x-f(x)/f'(x) to the function f(x) = 1/x - d. Another way to look at it + // is to suppose that dx = 1 - e, for some e which is small (because dx is + // already reasonably close to 1). Then you want to double the number of + // correct bits in the next approximation, i.e. square the error. So you want + // dX = 1-e^2 = (1-e)(1+e) = dx(2-dx). Cancelling d gives X = x(2-dx) again. + // + // In this situation, we're working in fixed-point integers rather than real + // numbers, and all the scales are different: + // * our input denominator d is in the range [2^23,2^24) + // * our input approximation x is in the range [2^7,2^8) + // * we want the output approximation to be in the range [2^15,2^16) + // Those factors combine to mean that we want + // x(2^32-dx) / 2^23 + // = (2^9 x) - (dx^2 / 2^23) + // + // But we also want to compute this using ordinary MUL, not a long multiply + // instruction (those are slower). So we need to worry about the product + // overflowing. dx fits in 32 bits, because it's the product of something + // <2^24 with something <2^8; but we must shift it right before multiplying + // by x again. 
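  // As a cross-check, here is a minimal C model of the refinement step that
  // the next four instructions perform (the helper name nr_refine is
  // hypothetical, not part of this patch). It mirrors the 32-bit register
  // arithmetic exactly: d is the 24-bit denominator mantissa, x the 8-bit
  // table estimate, and every product wraps mod 2^32 just as the MULs do.
  /*
  uint32_t nr_refine(uint32_t d, uint32_t x) {
    uint32_t dx = d * x;                    // MUL: d*x, fits in 32 bits
    uint32_t t = ~(dx >> 8);                // MVN+LSR: ~ approximates negation
    uint32_t sq = t * x;                    // MUL: roughly -d*x^2 / 2^8
    return (x << 9) + ((int32_t)sq >> 15);  // LSL, then ADD with ASR #15
  }
  */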
+ + mul r12, r0, r14 // r12 = dx + movlo r1, r1, lsl #1 // exponent setup [#2] in the MUL delay slot + mvn r12, r12, lsr #8 // r12 ~= -dx/2^8 + mul r3, r12, r14 // r3 ~= -dx^2/2^8 + mov r14, r14, lsl #9 // r14 = 2^9 x + add r14, r14, r3, asr #15 // r14 ~= 2^9 x - dx^2 / 2^23 + + // Now r14 is a 16-bit approximation to the reciprocal of the input mantissa, + // scaled by 2^39 (so that the min mantissa 2^23 would have reciprocal 2^16 + // in principle, and the max mantissa 2^24-1 would have reciprocal just over + // 2^15). The error is always negative (r14 is an underestimate of the true + // value), and the maximum error is 6 and a bit ULP (that is, the true + // reciprocal is strictly less than (r14+7)). Also, r14 is always strictly + // less than 0x10000 (even in the case of the min mantissa, where the true + // value would be _exactly_ 0x10000), which eliminates a case of integer + // overflow. + // + // All of these properties of the reciprocal approximation are checked by + // exhaustively iterating over all 2^23 possible input mantissas. (The nice + // thing about doing this in single rather than double precision!) + // + // Now we extract most of the quotient by two steps of long division, using + // the reciprocal estimate to identify a multiple of the denominator to + // subtract from the numerator. To avoid integer overflow, the numerator + // mantissa is shifted down 8 bits so that it's less than 0x10000. After we + // calculate an approximate quotient, we shift the numerator left and + // subtract that multiple of the denominator, moving the next portion of the + // numerator into range for the next iteration. + + // First iteration of long division. We shift the numerator left 11 bits, and + // since the quotient approximation is scaled by 2^31, we must shift that + // right by 20 to make the right product to subtract from the numerator. + mov r12, r1, lsr #8 // shift the numerator down + mul r12, r14, r12 // make the quotient approximation + mov r1, r1, lsl #11 // shift numerator left, ready for subtraction + mov r3, r12, lsr #20 // make first 12-bit block of quotient bits + mls r1, r0, r3, r1 // subtract that multiple of den from num + + add r2, r2, #0x7D0000 // exponent setup [#3] in the MLS delay slot + + // Second iteration of long division. Differences from the first step: this + // time we shift the numerator 12 bits instead of 11, so that the total of + // both steps is 23 bits, i.e. we've shifted up by exactly the full width of + // the output mantissa. Also, the block of output quotient bits is left in a + // different register: it was in r3 the first time, and this time it's in + // r12, so that we still have both available at the end of the process. + mov r12, r1, lsr #8 // shift the numerator down + mul r12, r14, r12 // make the quotient approximation + mov r1, r1, lsl #12 // shift numerator left, ready for subtraction + mov r12, r12, lsr #19 // make second 11-bit block of quotient + mls r1, r0, r12, r1 // subtract that multiple of den from num + + adc r2, r2, r2, asr #16 // exponent setup [#4] in the MLS delay slot + + // Now r1 contains the original numerator, shifted left 23, minus _some_ + // multiple of the original denominator (which is still in r0). The bounds on + // the error in the above steps should make the error at most 1: that is, we + // may have to subtract the denominator one more time to make r1 < r0, and + // increment the quotient by one more. 
+ // + // Our quotient is still in two pieces, computed separately in the above long + // division steps. We fold the final increment into the same instruction that + // recombines them, by doing the comparison in such a way that it sets the + // carry flag if the increment is needed. + + cmp r1, r0 // Set carry flag if num >= den + subhs r1, r1, r0 // If so, subtract den from num + adc r3, r12, r3, lsl #12 // Recombine quotient halves, plus optional +1 + + // We've finished with r14 as a temporary register, so we can unstack it now. + pop {r14} + + // Now r3 contains the _rounded-down_ output quotient, and r1 contains the + // remainder. That is, (denominator * r3 + r1) = (numerator << 23), and + // 0 <= r1 < denominator. + // + // Next we must round to nearest, by checking if r1 is greater than half the + // denominator. In division, it's not possible to hit an exact round-to-even + // halfway case, so we don't need to spend any time checking for it. + // + // Proof of no round-to-even: define the 'width' of a dyadic rational to be + // the distance between the lowest and highest 1 bits in its binary + // representation, or equivalently, the index of its high bit if you scale it + // by a power of 2 to make it an odd integer. E.g. any actual power of 2 has + // width 0, and all of 0b11110, 0b1111, 0b11.11 and 0b0.01111 have width 3. + // Then for any dyadic rationals a,b, width(ab) >= width(a)+width(b). Let w + // be the maximum width that the input precision supports (so that for single + // precision, w=23). Then if some division n/d were a round-to-even case, the + // true quotient q=n/d would have width exactly w+1. But we have qd=n, so + // width(n) >= width(q)+width(d) > w, which can't happen, because n is in the + // input precision, hence had width <= w.) + // + // So we don't need to check for an exact _halfway_ case and clear the low + // bit of the quotient after rounding up, as addition and multiplication both + // need to do. But we do need to remember if the quotient itself was exact, + // that is, if there was no remainder at all. That's needed in underflow + // handling. + + // The rounding check wants to compare remainder with denominator/2. But of + // course in integers it's easier to compare 2*remainder with denominator. So + // we start by shifting the remainder left by 1, and in the process, set Z if + // it's exactly 0 (i.e. the result needs no rounding at all). + lsls r1, r1, #1 + // Now trial-subtract the denominator. We don't do this at all if the result + // was exact. If we do do it, r1 goes negative precisely if we need to round + // up, which sets the C flag. (The previous instruction will have left C + // clear, since r1 had its top 8 bits all clear. So now C is set _only_ if + // we're rounding up.) + subsne r1, r1, r0 + // Recombine the quotient with the sign + exponent, and use the C flag from + // the previous instruction to increment the quotient if we're rounding up. + adc r0, r3, r2, lsl #23 + + // If we haven't either overflowed or underflowed, we're done. We can + // identify most of the safe cases by doing an unsigned comparison of the + // initial output exponent (in the top half of r2) with 0xFC: if 0 <= r2 < + // 0xFC0000 then we have neither underflow nor overflow. + // + // Rationale: the value in the top half of r2 had three chances to be + // incremented before becoming the exponent field of the actual output float. 
+ // It was incremented if we found the numerator mantissa was >= the + // denominator (producing the value in the _bottom_ half of r2, which we just + // ADCed into the output). Then it gets unconditionally incremented again + // when the ADC combines it with the leading mantissa bit. And finally, + // round-up might increment it a third time. So 0xFC is the smallest value + // that can possibly turn into the overflowed value 0xFF after all those + // increments. + // + // On the underflow side, (top half of r2) = 0 corresponds to a value of 1 in + // the final result's exponent field (and then rounding might increase it + // further); if the exponent was less than that then r2 wraps round and looks + // like a very large positive integer from the point of view of this unsigned + // comparison. + cmp r2, #0xFC0000 + bxlo lr + + // The same comparison will have set the N and V flags to reflect the result + // of comparing r2 with 0xFC0000 as a _signed_ integer. That reliably + // distinguishes potential underflow (r2 is negative) from potential overflow + // (r2 is positive and at least 0xFC0000) + bge LOCAL_LABEL(overflow) + + // Here we might or might not have underflow (but we know we don't have + // overflow). To check more carefully, we look at the _bottom_ half of r2, + // which contains the exponent after the first adjustment (for num >= denom), + // That is, it's still off by 1 (compensating for the leading quotient bit), + // and is also before rounding. + // + // We neglect the effect of rounding: division results that are tiny (less + // than the smallest normalised number) before rounding, but then round up to + // the smallest normal number, are an acceptable edge case to handle slowly. + // We pass those to funder without worrying about them. + // + // So we want to check whether the bottom half of r2 was negative. It would + // be nice to check bits 8-15 of it, but unfortunately, it's already been + // combined with the sign (at bit 8), so those bits don't tell us anything + // useful. Instead we look at the top 4 bits of the exponent field, i.e. the + // 0xF0 bits. The largest _non_-overflowing exponent that might reach here is + // less than 3, so it doesn't reach those bits; the smallest possible + // underflow, obtained by dividing the smallest denormal by the largest + // finite number, is -151 (before the leading bit increments it), which will + // set the low 8 bits of r2 to 0x69. That is, the 0xF0 nibble of r2 will be + // 0x60 or greater for a (pre-rounding) underflow, and zero for a + // non-underflow. + + tst r2, #0xF0 + bxeq lr // no underflow after all; return + + // Rebias the exponent for funder, which also corrects the sign bit. + add r0, r0, #192 << 23 + // Tell funder whether the true value is greater or less than the number in + // r0. This is obtained from the sign of the remainder (still in r1), with + // the only problem being that it's currently reversed. So negate r1 (leaving + // 0 at 0 to indicate exactness). + rsbs r1, r1, #0 + b SYMBOL_NAME(__compiler_rt_funder) + +LOCAL_LABEL(overflow): + // Here we might or might not have overflow (but we know we don't have + // underflow). We must check whether we really have overflowed. + // + // For this it's easiest to check the exponent field in the actual output + // value in r0, after _all_ the adjustments have been completed. 
The largest + // overflowed exponent is 0x193, and the smallest exponent that can reach + // this is 0xFD (we checked against 0xFC above, but then the leading quotient + // bit incremented it). So it's enough to shift the output left by one + // (moving the exponent field to the top), increment it once more (so that + // the smallest overflowed exponent 0xFF wraps round to 0), and then compare + // against 0xFE000000 as an unsigned integer. + mov r12, r0, lsl #1 + add r12, r12, #1 << 24 + cmp r12, #0xFE << 24 // Check for exp = 253 or 254 + bxhs lr + // We have actual overflow. Rebias r0 to bring the exponent back into range, + // which ensures its sign is correct. Then make an infinity of that sign to + // return. + subs r0, r0, #0xC0 << 23 + movs r12, #0xFF // exponent of infinity + orrs r12, r12, r0, lsr #23 // exponent and sign at bottom of r12 + movs r0, r12, lsl #23 // shift it up to the top of r0 to return + bx lr + +LOCAL_LABEL(uncommon): + // We come here from the start of the function if either input is an uncommon + // value: zero, denormal, infinity or NaN. + // + // We arrive here with r12 = 0xFF000000, and r2 containing the exponent of x + // in bits 16..23. But r3 doesn't necessarily contain the exponent of y, + // because the instruction that set it up was conditional. So first we + // unconditionally repeat it. + and r3, r12, r1, lsr #7 + + // In all cases not involving a NaN as output, the sign of the output is made + // in the same way as for finite numbers, as the XOR of the input signs. So + // repeat the sign setup from the main branch. + teq r0, r1 // is the output sign bit 1? + orrmi r2, r2, #0x100 // if so, set bit 8 of r2 + + // Detect infinities and NaNs, by checking if either of r2 or r3 is at least + // 0xFF0000. + cmp r2, #0xFF0000 + cmplo r3, #0xFF0000 + bhs LOCAL_LABEL(inf_NaN) + + // Now we know there are no infinities or NaNs, but there's at least one zero + // or denormal. + movs r12, r1, lsl #1 // is y zero? + beq LOCAL_LABEL(divbyzero) // if so, go and handle division by zero + movs r12, r0, lsl #1 // is x zero? (now we know that y is not) + moveq r0, r2, lsl #23 // if so, 0/nonzero is just 0 (of right sign) + bxeq lr + + // Now we've eliminated zeroes as well, leaving only denormals: either x or + // y, or both, is a denormal. Call fnorm2 to convert both into a normalised + // mantissa and a (potentially small) exponent. + and r12, r2, #0x100 // save the result sign from r2 + lsr r2, #16 // shift extracted exponents down to bit 0 + lsr r3, #16 // where fnorm2 will expect them + push {r0, r1, r2, r3, r12, lr} + mov r0, sp // tell fnorm2 where to find its data + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0, r1, r2, r3, r12, lr} + lsl r3, #16 // shift exponents back up to bit 16 + orr r2, r12, r2, lsl #16 // and put the result sign back in r2 + + // Now rejoin the main code path, having finished the setup it will expect: + // swap x and y, and shift the fractions back down to the low 24 bits. + mov r12, r0, lsr #8 + mov r0, r1, lsr #8 + mov r1, r12 + b LOCAL_LABEL(div) + +LOCAL_LABEL(inf_NaN): + // We come here if at least one input is a NaN or infinity. If either or both + // inputs are NaN then we hand off to fnan2 to propagate a NaN from the + // input. + mov r12, #0xFF000000 + cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + blo SYMBOL_NAME(__compiler_rt_fnan2) + cmp r12, r1, lsl #1 + blo SYMBOL_NAME(__compiler_rt_fnan2) + + // No NaNs, so we have three options: inf/inf = NaN, inf/finite = inf, and + // finite/inf = 0. 
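  // In C terms, the three remaining cases reduce to this sketch (the name
  // div_inf_cases is hypothetical, not part of this patch); a and b are the
  // raw IEEE-754 bit patterns, at least one of them infinite, neither a NaN:
  /*
  uint32_t div_inf_cases(uint32_t a, uint32_t b) {
    uint32_t sign = (a ^ b) & 0x80000000u;  // XOR of the input signs
    int a_inf = (a << 1) == 0xFF000000u;
    int b_inf = (b << 1) == 0xFF000000u;
    if (a_inf && b_inf)
      return 0x7FC00000u;                   // inf/inf: default quiet NaN
    if (a_inf)
      return sign | 0x7F800000u;            // inf/finite: signed infinity
    return sign;                            // finite/inf: signed zero
  }
  */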
+ + // If both operands are infinity, we return a NaN. Since we know at + // least _one_ is infinity, we can test this by checking if they're + // equal apart from the sign bits. + eor r3, r0, r1 + lsls r3, #1 // were all bits of XOR zero other than top? + beq LOCAL_LABEL(invalid) // if so, both operands are infinity + + // See if x is infinite + cmp r12, r0, lsl #1 // (r0 << 1) == 0xFF000000? + beq LOCAL_LABEL(infret) // if so, infinity/finite = infinity + + // y is infinite and x is not, so we return a zero of the + // combined sign. + eor r0, r0, r1 // calculate the right sign + and r0, r0, #0x80000000 // throw away everything else + bx lr + +LOCAL_LABEL(divbyzero): + // Here, we know y is zero. But we don't know if x is zero or nonzero. So we + // might be calculating 0/0 (invalid operation, generating a NaN), or + // nonzero/0 (the IEEE "division by zero" exception, generating infinity). + movs r12, r0, lsl #1 // is x zero too? + beq LOCAL_LABEL(invalid) // if so, go and return a NaN + +LOCAL_LABEL(infret): + // Here, we're either dividing infinity by a finite number, or dividing a + // nonzero number by 0. (Or both, if we're dividing infinity by 0.) In all + // these cases we return infinity with the sign from r2. + // + // If we were implementing IEEE exceptions, we'd have to separate these + // cases: infinity / finite is not an _exception_, it just returns infinity, + // whereas (finite and nonzero) / 0 is a division-by-zero exception. But here + // we're not implementing exceptions, so we can treat all three cases the + // same. + // + // r2 contains the output sign in bit 8, which is a convenient place to find + // it when making an infinity, because we can fill in the 8 exponent bits + // below that and then shift it left. + orr r2, r2, #0xff // sign + maximum exponent + lsl r0, r2, #23 // shift up to the top + bx lr + +LOCAL_LABEL(invalid): + // Return the default NaN, from an invalid operation (either dividing + // infinity by infinity, or 0 by 0). + ldr r0, =0x7FC00000 + bx lr + +// Finally, the lookup table for the initial reciprocal approximation. +// +// The table index is made from the top 7 bits of the denominator mantissa. But +// the topmost bit is always 1, so only the other 6 bits vary. So it only has +// 64 entries, not 128. +// +// Each table entry is a single byte, with its top bit set. So the table +// entries correspond to the reciprocal of a 7-bit mantissa prefix scaled up by +// 2^14, or the reciprocal of a whole 24-bit mantissa scaled up by 2^31. +// +// Each of these 64 entries corresponds to a large interval of possible +// mantissas. For example, if the top 7 bits are 1000001 then the overall +// mantissa could be anything from 0x820000 to 0x83FFFF. And because the output +// of this table provides more bits than the input, there are several choices +// of 8-bit reciprocal approximation for a number in that interval. The +// reciprocal of 0x820000 starts with 0xFC plus a fraction, and the reciprocal +// of 0x83FFFF starts with 0xF9 minus a fraction, so there are four reasonable +// choices for that table entry: F9, FA, FB or FC. Which do we pick? +// +// The table below is generated by choosing whichever value minimises the +// maximum possible error _after_ the approximation is improved by the +// Newton-Raphson step. In the example above, we end up with FA. +// +// The Python code below will regenerate the table, complete with the per-entry +// comments. 
+ +/* + +for prefix in range(64, 128): + best = None + + # Max and min 23-bit mantissas with this 7-bit prefix + mmin, mmax = prefix * 2**17, (prefix + 1) * 2**17 - 1 + + # Max and min table entry corresponding to the reciprocal of something in + # that range of mantissas: round up the reciprocal of mmax, and round down + # the reciprocal of mmin. Also clamp to the range [0x80,0xff], because + # 0x100 can't be used as a table entry due to not fitting in a byte, even + # though it's the exact reciprocal of the overall-smallest mantissa + # 0x800000. + gmin = max(128, (2**31 + mmin - 1) // mmax) + gmax = min(255, 2**31 // mmin) + + # For each of those table entries, compute the result of starting from that + # value and doing a Newton-Raphson iteration, with the mantissa at each end + # of the mantissa interval. One of these will be the worst possible error. + # Choose the table entry whose worst error is as small as possible. + # + # (To find the extreme values of a more general function on an interval, + # you must consider its values not only at the interval endpoints but also + # any turning points within the interval. Here, the function has only one + # turning point, and by construction it takes value 0 there, so we needn't + # worry.) + g = max( + range(gmin, gmax + 1), + key=lambda g: min( + (g * (2**32 - d * g) / 2**23 - 2**39 / d) for d in [mmin, mmax] + ), + ) + + print(f" .byte 0x{g:02x} // input [0x{mmin:06x},0x{mmax:06x}]" + f", candidate outputs [0x{gmin:02x},0x{gmax:02x}]" + ) + +*/ + + .p2align 2 // make sure we start on a 4-byte boundary, even in Thumb +LOCAL_LABEL(tab): + .byte 0xfe // input [0x800000,0x81ffff], candidate outputs [0xfd,0xff] + .byte 0xfa // input [0x820000,0x83ffff], candidate outputs [0xf9,0xfc] + .byte 0xf6 // input [0x840000,0x85ffff], candidate outputs [0xf5,0xf8] + .byte 0xf3 // input [0x860000,0x87ffff], candidate outputs [0xf1,0xf4] + .byte 0xef // input [0x880000,0x89ffff], candidate outputs [0xee,0xf0] + .byte 0xec // input [0x8a0000,0x8bffff], candidate outputs [0xeb,0xed] + .byte 0xe8 // input [0x8c0000,0x8dffff], candidate outputs [0xe7,0xea] + .byte 0xe5 // input [0x8e0000,0x8fffff], candidate outputs [0xe4,0xe6] + .byte 0xe2 // input [0x900000,0x91ffff], candidate outputs [0xe1,0xe3] + .byte 0xdf // input [0x920000,0x93ffff], candidate outputs [0xde,0xe0] + .byte 0xdc // input [0x940000,0x95ffff], candidate outputs [0xdb,0xdd] + .byte 0xd9 // input [0x960000,0x97ffff], candidate outputs [0xd8,0xda] + .byte 0xd6 // input [0x980000,0x99ffff], candidate outputs [0xd5,0xd7] + .byte 0xd3 // input [0x9a0000,0x9bffff], candidate outputs [0xd3,0xd4] + .byte 0xd1 // input [0x9c0000,0x9dffff], candidate outputs [0xd0,0xd2] + .byte 0xce // input [0x9e0000,0x9fffff], candidate outputs [0xcd,0xcf] + .byte 0xcc // input [0xa00000,0xa1ffff], candidate outputs [0xcb,0xcc] + .byte 0xc9 // input [0xa20000,0xa3ffff], candidate outputs [0xc8,0xca] + .byte 0xc7 // input [0xa40000,0xa5ffff], candidate outputs [0xc6,0xc7] + .byte 0xc4 // input [0xa60000,0xa7ffff], candidate outputs [0xc4,0xc5] + .byte 0xc2 // input [0xa80000,0xa9ffff], candidate outputs [0xc1,0xc3] + .byte 0xc0 // input [0xaa0000,0xabffff], candidate outputs [0xbf,0xc0] + .byte 0xbd // input [0xac0000,0xadffff], candidate outputs [0xbd,0xbe] + .byte 0xbb // input [0xae0000,0xafffff], candidate outputs [0xbb,0xbc] + .byte 0xb9 // input [0xb00000,0xb1ffff], candidate outputs [0xb9,0xba] + .byte 0xb7 // input [0xb20000,0xb3ffff], candidate outputs [0xb7,0xb8] + .byte 0xb5 // input [0xb40000,0xb5ffff], 
candidate outputs [0xb5,0xb6] + .byte 0xb3 // input [0xb60000,0xb7ffff], candidate outputs [0xb3,0xb4] + .byte 0xb1 // input [0xb80000,0xb9ffff], candidate outputs [0xb1,0xb2] + .byte 0xaf // input [0xba0000,0xbbffff], candidate outputs [0xaf,0xb0] + .byte 0xad // input [0xbc0000,0xbdffff], candidate outputs [0xad,0xae] + .byte 0xac // input [0xbe0000,0xbfffff], candidate outputs [0xab,0xac] + .byte 0xaa // input [0xc00000,0xc1ffff], candidate outputs [0xa9,0xaa] + .byte 0xa8 // input [0xc20000,0xc3ffff], candidate outputs [0xa8,0xa8] + .byte 0xa6 // input [0xc40000,0xc5ffff], candidate outputs [0xa6,0xa7] + .byte 0xa5 // input [0xc60000,0xc7ffff], candidate outputs [0xa4,0xa5] + .byte 0xa3 // input [0xc80000,0xc9ffff], candidate outputs [0xa3,0xa3] + .byte 0xa1 // input [0xca0000,0xcbffff], candidate outputs [0xa1,0xa2] + .byte 0xa0 // input [0xcc0000,0xcdffff], candidate outputs [0xa0,0xa0] + .byte 0x9e // input [0xce0000,0xcfffff], candidate outputs [0x9e,0x9f] + .byte 0x9d // input [0xd00000,0xd1ffff], candidate outputs [0x9d,0x9d] + .byte 0x9b // input [0xd20000,0xd3ffff], candidate outputs [0x9b,0x9c] + .byte 0x9a // input [0xd40000,0xd5ffff], candidate outputs [0x9a,0x9a] + .byte 0x98 // input [0xd60000,0xd7ffff], candidate outputs [0x98,0x99] + .byte 0x97 // input [0xd80000,0xd9ffff], candidate outputs [0x97,0x97] + .byte 0x96 // input [0xda0000,0xdbffff], candidate outputs [0x95,0x96] + .byte 0x94 // input [0xdc0000,0xddffff], candidate outputs [0x94,0x94] + .byte 0x93 // input [0xde0000,0xdfffff], candidate outputs [0x93,0x93] + .byte 0x92 // input [0xe00000,0xe1ffff], candidate outputs [0x91,0x92] + .byte 0x90 // input [0xe20000,0xe3ffff], candidate outputs [0x90,0x90] + .byte 0x8f // input [0xe40000,0xe5ffff], candidate outputs [0x8f,0x8f] + .byte 0x8e // input [0xe60000,0xe7ffff], candidate outputs [0x8e,0x8e] + .byte 0x8d // input [0xe80000,0xe9ffff], candidate outputs [0x8d,0x8d] + .byte 0x8b // input [0xea0000,0xebffff], candidate outputs [0x8b,0x8c] + .byte 0x8a // input [0xec0000,0xedffff], candidate outputs [0x8a,0x8a] + .byte 0x89 // input [0xee0000,0xefffff], candidate outputs [0x89,0x89] + .byte 0x88 // input [0xf00000,0xf1ffff], candidate outputs [0x88,0x88] + .byte 0x87 // input [0xf20000,0xf3ffff], candidate outputs [0x87,0x87] + .byte 0x86 // input [0xf40000,0xf5ffff], candidate outputs [0x86,0x86] + .byte 0x85 // input [0xf60000,0xf7ffff], candidate outputs [0x85,0x85] + .byte 0x84 // input [0xf80000,0xf9ffff], candidate outputs [0x84,0x84] + .byte 0x83 // input [0xfa0000,0xfbffff], candidate outputs [0x83,0x83] + .byte 0x82 // input [0xfc0000,0xfdffff], candidate outputs [0x82,0x82] + .byte 0x81 // input [0xfe0000,0xffffff], candidate outputs [0x80,0x81] + +END_COMPILERRT_FUNCTION(__aeabi_fdiv) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/fnan2.c b/compiler-rt/lib/builtins/arm/fnan2.c new file mode 100644 index 0000000000000..06bbd4339f171 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fnan2.c @@ -0,0 +1,42 @@ +//===-- fnan2.c - Handle single-precision NaN inputs to binary operation --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations to handle propagating NaNs from the input +// operands to the output, in a way that matches Arm hardware FP. +// +// On input, a and b are floating-point numbers in IEEE 754 encoding, and at +// least one of them must be a NaN. The return value is the correct output NaN. +// +// A signalling NaN in the input (with bit 22 clear) takes priority over any +// quiet NaN, and is adjusted on return by setting bit 22 to make it quiet. If +// both inputs are the same type of NaN then the first input takes priority: +// the input a is used instead of b. +// +//===----------------------------------------------------------------------===// + +#include <stdint.h> + +uint32_t __compiler_rt_fnan2(uint32_t a, uint32_t b) { + // Make shifted-left copies of a and b to discard the sign bit. Then add 1 at + // the bit position where the quiet vs signalling bit ended up. This squashes + // all the signalling NaNs to the top of the range of 32-bit values, from + // 0xff800001 to 0xffffffff inclusive; meanwhile, all the quiet NaN values + // wrap round to the bottom, from 0 to 0x007fffff inclusive. So we can detect + // a signalling NaN by asking if it's greater than 0xff800000, and a quiet + // one by asking if it's less than 0x00800000. + uint32_t aadj = (a << 1) + 0x00800000; + uint32_t badj = (b << 1) + 0x00800000; + if (aadj > 0xff800000) // a is a signalling NaN? + return a | 0x00400000; // if so, return it with the quiet bit set + if (badj > 0xff800000) // b is a signalling NaN? + return b | 0x00400000; // if so, return it with the quiet bit set + if (aadj < 0x00800000) // a is a quiet NaN? + return a; // if so, return it + return b; // otherwise we expect b must be a quiet NaN +} diff --git a/compiler-rt/lib/builtins/arm/fnorm2.c b/compiler-rt/lib/builtins/arm/fnorm2.c new file mode 100644 index 0000000000000..29eba1cbde59d --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fnorm2.c @@ -0,0 +1,62 @@ +//===-- fnorm2.c - Handle single-precision denormal inputs to binary op ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations, to handle denormal inputs on entry by +// renormalizing the mantissa and modifying the exponent to match. +// +//===----------------------------------------------------------------------===// + +#include <stdint.h> + +// Structure containing the function's inputs and outputs. +// +// On entry: a, b are two input floating-point numbers, still in IEEE 754 +// encoding. expa and expb are the 8-bit exponents of those numbers, extracted +// and shifted down to the low 8 bits of the word, with no other change. +// Neither value should be zero, or have the maximum exponent (indicating an +// infinity or NaN). +// +// On exit: each of a and b contains the mantissa of the input value, with the +// leading 1 bit made explicit, and shifted up to the top of the word. If expa +// was zero (indicating that a was denormal) then it is now represented as a +// normalized number with an out-of-range exponent (zero or negative).
The same +// applies to expb and b. +struct fnorm2 { + uint32_t a, b, expa, expb; +}; + +void __compiler_rt_fnorm2(struct fnorm2 *values) { + // Shift the mantissas of a and b to the right place to follow a leading 1 in + // the top bit, if there is one. + values->a <<= 8; + values->b <<= 8; + + // Test if a is denormal. + if (values->expa == 0) { + // If so, decide how much further up to shift its mantissa, and adjust its + // exponent to match. This brings the leading 1 of the denormal mantissa to + // the top of values->a. + uint32_t shift = __builtin_clz(values->a); + values->a <<= shift; + values->expa = 1 - shift; + } else { + // Otherwise, leave the mantissa of a in its current position, and OR in + // the explicit leading 1. + values->a |= 0x80000000; + } + + // Do the same operation on b. + if (values->expb == 0) { + uint32_t shift = __builtin_clz(values->b); + values->b <<= shift; + values->expb = 1 - shift; + } else { + values->b |= 0x80000000; + } +} diff --git a/compiler-rt/lib/builtins/arm/funder.c b/compiler-rt/lib/builtins/arm/funder.c new file mode 100644 index 0000000000000..fd29e157328a3 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/funder.c @@ -0,0 +1,78 @@ +//===-- funder.c - Handle single-precision floating-point underflow -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations to handle underflowed output values, if they were +// computed in the form of a normalized mantissa and an out-of-range exponent. +// +// On input: x should be a complete IEEE 754 floating-point value representing +// the desired output scaled up by 2^192 (the same value that would have been +// passed to an underflow trap handler in IEEE 754:1985). +// +// This isn't enough information to re-round to the correct output denormal +// without also knowing whether x itself has already been rounded, and which +// way. 'errsign' gives this information, by indicating the sign of the value +// (true result - x). That is, if errsign > 0 it means the true value was +// larger (x was rounded down); if errsign < 0 then x was rounded up; if +// errsign == 0 then x represents the _exact_ desired output value. +// +//===----------------------------------------------------------------------===// + +#include <stdint.h> + +#define SIGNBIT 0x80000000 +#define MANTSIZE 23 +#define BIAS 0xc0 + +uint32_t __compiler_rt_funder(uint32_t x, uint32_t errsign) { + uint32_t sign = x & SIGNBIT; + uint32_t exponent = (x << 1) >> 24; + + // Rule out exponents so small (or large!) that no denormalisation + // is needed. + if (exponent > BIAS) { + // Exponent 0xc1 or above means a normalised number got here by + // mistake, so we just remove the 0xc0 exponent bias and go + // straight home. + return x - (BIAS << MANTSIZE); + } + uint32_t bits_lost = BIAS + 1 - exponent; + if (bits_lost > MANTSIZE + 1) { + // The implicit leading 1 of the intermediate value's mantissa is + // below the lowest mantissa bit of a denormal by at least 2 bits. + // Round down to 0 unconditionally. + return sign; + } + + // Make the full mantissa (with leading bit) at the top of the word. + uint32_t mantissa = 0x80000000 | (x << 8); + // Adjust by 1 depending on the sign of the error.
+ mantissa -= errsign >> 31; + mantissa += (-errsign) >> 31; + + // Shift down to the output position, keeping the bits shifted off. + uint32_t outmant, shifted_off; + if (bits_lost == MANTSIZE + 1) { + // Special case for the exponent where we have to shift the whole + // of 'mantissa' off the bottom of the word. + outmant = 0; + shifted_off = mantissa; + } else { + outmant = mantissa >> (8 + bits_lost); + shifted_off = mantissa << (32 - (8 + bits_lost)); + } + + // Re-round. + if (shifted_off >> 31) { + outmant++; + if (!(shifted_off << 1)) + outmant &= ~1; // halfway case: round to even + } + + return sign | outmant; +} diff --git a/compiler-rt/lib/builtins/arm/mulsf3.S b/compiler-rt/lib/builtins/arm/mulsf3.S new file mode 100644 index 0000000000000..346d3ed377c9c --- /dev/null +++ b/compiler-rt/lib/builtins/arm/mulsf3.S @@ -0,0 +1,319 @@ +//===-- mulsf3.S - single-precision floating point multiplication ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float multiplication with the +// IEEE-754 default rounding (to nearest, ties to even), in optimized AArch32 +// assembly language suitable to be built as either Arm or Thumb2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + + .syntax unified + .text + .p2align 2 + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__mulsf3) + push {r4, lr} + vmov r0, s0 + vmov r1, s1 + bl __aeabi_fmul + vmov s0, r0 + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__mulsf3, __aeabi_fmul) +#endif + +DEFINE_COMPILERRT_FUNCTION(__aeabi_fmul) + + // Check if either input exponent is 00 or FF (i.e. not a normalized number), + // and if so, branch out of line. If we don't branch out of line, then we've + // also extracted the exponents of the input values r0/r1 into bits 16..23 of + // r2/r3. But if we do, then that hasn't necessarily been done (because the + // second AND might have been skipped). + mov r12, #0xFF0000 + ands r2, r12, r0, lsr #7 // sets Z if exponent of x is 0 + andsne r3, r12, r1, lsr #7 // otherwise, sets Z if exponent of y is 0 + teqne r2, r12 // otherwise, sets Z if exponent of x is FF + teqne r3, r12 // otherwise, sets Z if exponent of y is FF + beq LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm + + // Calculate the sign of the result, and put it in an unused bit of r2. + teq r0, r1 // sets N to the XOR of x and y's sign bits + orrmi r2, r2, #0x100 // if N set, set bit 8 of r2 + + // Move the input mantissas to the high end of r0/r1, each with its leading + // bit set explicitly, so that they're in the right form to be multiplied. + mov r12, #0x80000000 + orr r0, r12, r0, lsl #8 + orr r1, r12, r1, lsl #8 + + // Now we're ready to multiply mantissas. This is also the place we'll come + // back to after decoding denormal inputs. The denormal decoding will also + // have to set up the same register contents: + // - decoded fractions at the top of r0 and r1 + // - exponents in r2 and r3, starting at bit 16 + // - output sign in r2 bit 8 +LOCAL_LABEL(mul): + + // Here we multiply the mantissas, and compute the output exponent by adding + // the input exponents and rebiasing. These operations are interleaved to + // use a delay slot. 
+ // + // The exponent is rebiased by subtracting 0x80, rather than the 0x7F you'd + // expect. That compensates for the leading bit of the mantissa overlapping + // it, when we recombine the exponent and mantissa by addition. + add r2, r2, r3 // r2 has sum of exponents, freeing up r3 + umull r1, r3, r0, r1 // r3:r1 has the double-width product + sub r2, r2, #(0x80 << 16) // rebias the summed exponent + + // Compress the double-word product into just the high-order word r3, by + // setting its bit 0 if any bit of the low-order word is nonzero. This + // changes the represented value, but not by nearly enough to affect + // rounding, because rounding only depends on the bit below the last output + // bit, and the general question of whether _any_ nonzero bit exists below + // that. + cmp r1, #0 // if low word of full product is nonzero + orrne r3, r3, #1 // then set LSB of high word + + // The two inputs to UMULL had their high bits set, that is, were at least + // 0x80000000. So the 64-bit product was at least 0x4000000000000000, i.e. + // the high bit of the product could be at the top of the word or one bit + // below. Check which, by experimentally shifting left, and then undoing it + // via RRX if we turned out to have shifted off a 1 bit. + lsls r3, r3, #1 // shift left, setting C to the bit shifted off + rrxcs r3, r3 // if that bit was 1, put it back again + + // That ensured the leading 1 bit of the product is now the top of r3, but + // also, set C if the leading 1 was _already_ in the top bit. So now we know + // whether to increment the exponent. The following instruction does the + // conditional increment (because it's ADC), but also, copies the exponent + // field from bit 16 of r2 into bit 0, so as to place it just below the + // output sign bit. + // + // So, if the number hasn't overflowed or underflowed, the low 9 bits of r2 + // are exactly what we need to combine with the rounded mantissa. But the + // full output exponent (with extra bits) is still available in the high half + // of r2, so that we can check _whether_ we overflowed or underflowed. + adc r2, r2, r2, asr #16 + + // Recombine the exponent and mantissa, doing most of the rounding as a side + // effect: we shift the mantissa right so as to put the round bit into C, and + // then we recombine with the exponent using ADC, to increment the mantissa + // if C was set. + movs r12, r3, lsr #8 + adc r0, r12, r2, lsl #23 + + // To complete the rounding, we must check for the round-to-even tiebreaking + // case, by checking if we're in the exact halfway case, which occurs if and + // only if we _did_ round up (we can tell this because C is still set from + // the MOVS), and also, no bit of r3 is set _below_ the round bit. + // + // We combine this with an overflow check, so that C ends up set if anything + // weird happened, and clear if we're completely finished and can return. + // + // The best instruction sequence for this part varies between Arm and Thumb. +#if !__thumb__ + // Arm state: if C was set then we check the low bits of r3, so that Z ends + // up set if we need to round to even. + // + // (We rely here on Z reliably being clear to begin with, because shifting + // down the output mantissa definitely gave a nonzero output. Also, the TST + // doesn't change C, so if Z does end up set, then C was also set.) + // + // Then, if we're not rounding to even, we do a CMP which sets C if there's + // been an overflow or an underflow. 
An overflow could occur for an output
+  // exponent as low as 0xFC, because we might increment the exponent by 1 when
+  // renormalizing, by another when recombining with the mantissa, and by one
+  // more if rounding up causes a carry off the top of the mantissa. An
+  // underflow occurs only if the output exponent is negative (because it's
+  // offset by 1, so an exponent of 0 will be incremented to 1), in which case
+  // the top 8 bits of r2 will all be set. Therefore, an unsigned comparison to
+  // see if r2 > 0xFC0000 will catch all overflow and underflow cases. It also
+  // catches a few very large cases that _don't_ quite overflow (exponents of
+  // 0xFC and above that don't get maximally unlucky); those will also be
+  // handled by the slow path.
+  tstcs r3, #0x7F
+  cmpne r2, #0xFC0000
+#else
+  // In Thumb, switching between different conditions has a higher cost due to
+  // the (implicit in this code) IT instructions, so we prefer a strategy that
+  // uses CC and CS conditions throughout, at the cost of requiring some extra
+  // cleanup instructions on the slow path.
+  //
+  // If C is set (and hence round-to-even is a possibility), the basic idea is
+  // to shift the full result word (r3) left by 25, leaving only its bottom 7
+  // bits, which are now the top 7 bits; then we want to set C iff these are 0.
+  //
+  // The "CMP x,y" instruction sets C if x >= y, and clears it if y > x (as
+  // unsigned integers). So this could be done in one instruction if only we
+  // had a register to use as x which has 0 in the top 7 bits and at least one
+  // nonzero bit below them. Then we could compare that against the shifted-up
+  // value of r3, setting C precisely if the top 7 bits of y are all zero. And
+  // happily, we _do_ have such a register! r12 contains the shifted-down
+  // mantissa, which is guaranteed to have a 1 in bit 23, and 0 above that.
+  //
+  // The shift of r3 happens only in the second operand of the compare, so we
+  // don't lose the original value of r3 in this process.
+  //
+  // The check for over/underflow is exactly as in the Arm branch above, except
+  // based on a different condition.
+  cmpcs r12, r3, lsl #25     // now C is set iff we're rounding to even
+  cmpcc r2, #0xFC0000        // and now it's also set if we've over/underflowed
+#endif
+
+  // That's all the checks for difficult cases done. If C is clear, we can
+  // return.
+  bxcc lr
+
+  // Now the slower path begins. We have to recover enough information to
+  // handle all of round-to-even, overflow and underflow.
+  //
+  // Round to even is the most likely of these, so we detect it first and
+  // handle it as fast as possible.
+
+#if __thumb__
+  // First, Thumb-specific compensation code. The Arm branch of the #if above
+  // will have set Z to indicate round to even, but the Thumb branch didn't
+  // leave any unambiguous indicator of RTE, so we must retest by checking all
+  // the bits shifted off the bottom of the mantissa to see if they're exactly
+  // the half-way value.
+  lsl r12, r3, #24           // r12 = round bit and everything below
+  cmp r12, #0x80000000       // set Z if that is exactly 0x80000000
+#endif
+
+  // Now Z is set iff we have already rounded up and must replace that with
+  // rounding to even, which is done by just clearing the low bit of the
+  // mantissa.
+  biceq r0, r0, #1
+
+  // Redo the over/underflow check (the same way as in both branches above),
+  // and if it doesn't report a danger, we can return the rounded-to-even
+  // answer.
+  cmp r2, #0xFC0000          // check for over/underflow
+  bxcc lr                    // and return if none.
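+
+  // State at this point, recapping the above: r0 holds the packed candidate
+  // result, already rounded and round-to-even corrected; the high half of r2
+  // still holds the untruncated output exponent; and r3 still holds the
+  // full-length product mantissa with its sticky low bit, which the underflow
+  // path below hands to funder.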
+ + // Now we only have overflow and underflow left to handle. First, find out + // which we're looking at. This is easy by testing the top bit of r2, but + // even easier by using the fact that the possible positive and negative + // values of r2 are widely enough separated that the 0xFC0000 subtracted by + // the CMP above won't have made any difference. So the N flag output from + // that comparison _already_ tells us which condition we have: if N is set we + // have underflow, and if N is clear, overflow. + bpl LOCAL_LABEL(overflow) + + // Here we're handling underflow. + + // Add the IEEE 754:1985 exponent bias which funder will expect. This also + // brings the exponent back into a range where it can't possibly have carried + // into the sign bit, so the output sign will now be right. + add r0, r0, #(0xC0 << 23) + + // Determine whether we rounded up, down or not at all. + lsls r2, r3, #1 // input mantissa, without its leading 1 + subs r1, r2, r0, lsl #9 // subtract the output mantissa (likewise) + + // And let funder handle the rest. + b SYMBOL_NAME(__compiler_rt_funder) + +LOCAL_LABEL(overflow): + // We come here to handle overflow, but it's not guaranteed that an overflow + // has actually happened: our check on the fast path erred on the side of + // caution, by catching any output exponent that _could_ cause an overflow. + // So first check whether this really is an overflow, by extracting the + // output exponent. Exponent 0xFF, or anything that wrapped round to having + // the high bit clear, are overflows; 0xFE down to 0xFC are not overflows. + // + // The value in r0 is correct to return, if there's no overflow. + add r12, r0, #(1 << 23) // add 1 to the exponent so 0xFF wraps to 0 + movs r12, r12, lsl #1 // test the top bit of the modified value + bxmi lr // if top bit is still 1, not an overflow + + // This is an overflow, so we need to replace it with an appropriately signed + // infinity. First we correct the sign by applying a downward bias to the + // exponent (the one suggested in IEEE 754:1985, which was chosen to bring + // all possible overflowed results back into range). + subs r0, r0, #(0xC0 << 23) + + // Now the sign bit of r0 is correct. Replace everything else with the + // encoding of an infinity. + mov r1, #0xFF + and r0, r0, #0x80000000 + orr r0, r0, r1, lsl #23 + bx lr + +LOCAL_LABEL(uncommon): + // Handle zeros, denorms, infinities and NaNs. We arrive here knowing that + // we've at least done the first _two_ instructions from the entry point, + // even if all the rest were skipped. So r2 contains the sign and exponent of + // x in bits 16..23, and r12 = 0xFF << 16. + // + // So, first repeat some instructions from the prologue, which were either + // conditionally skipped in the sequence leading to the branch, or skipped + // because they happened after the branch. + and r3, r12, r1, lsr #7 // get exponent of y in r3 bits 16..23 + teq r0, r1 // calculate the sign of the result + orrmi r2, r2, #0x100 // and put it in bit 8 of r2 as before + + // Check for infinities and NaNs, by testing each of r2,r3 to see if it's at + // least 0xFF0000 (hence the exponent field is equal to 0xFF). + cmp r2, r12 + cmplo r3, r12 + bhs LOCAL_LABEL(inf_NaN) + + // If we didn't take that branch, then we have only finite numbers, but at + // least one is denormal or zero. A zero makes the result easy (and also is a + // more likely input than a denormal), so check those first, as fast as + // possible. 
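+  // (Shifting a value left by one discards its sign bit, so Z is set exactly
+  // when an input is +0 or -0. And the only nonzero bit in the low 16 bits of
+  // r2 is the result sign at bit 8, so "r2, lsl #23" below is precisely the
+  // correctly signed zero to return.)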
+ movs r12, r0, lsl #1 // Z set if x == 0 + movsne r12, r1, lsl #1 // now Z set if either input is 0 + moveq r0, r2, lsl #23 // in either case, make 0 of the output sign + bxeq lr // and return it + + // Now we know we only have denormals to deal with. Call fnorm2 to sort + // them out, and rejoin the main code path above. + and r12, r2, #0x100 // save the result sign from r2 + lsr r2, #16 // shift extracted exponents down to bit 0 + lsr r3, #16 // where fnorm2 will expect them + push {r0, r1, r2, r3, r12, lr} + mov r0, sp // tell fnorm2 where to find its data + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0, r1, r2, r3, r12, lr} + lsl r3, #16 // shift exponents back up to bit 16 + orr r2, r12, r2, lsl #16 // and put the result sign back in r2 + b LOCAL_LABEL(mul) + +LOCAL_LABEL(inf_NaN): + // We come here if at least one input is a NaN or infinity. If either or both + // inputs are NaN then we hand off to fnan2 which will propagate a NaN from + // the input; otherwise any multiplication involving infinity returns + // infinity, unless it's infinity * 0 which is an invalid operation and + // returns NaN again. + mov r12, #0xFF000000 + cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + blo SYMBOL_NAME(__compiler_rt_fnan2) + cmp r12, r1, lsl #1 + blo SYMBOL_NAME(__compiler_rt_fnan2) + + // NaNs are dealt with, so now we have at least one infinity. Check if the + // other operand is 0. This is conveniently done by XORing the two: because + // we know that the low 31 bits of one operand are exactly 0x7F800000, we can + // test if the low 31 bits of the other one are all 0 by checking whether the + // low 31 bits of (x XOR y) equal 0x7F800000. + eor r3, r0, r1 + cmp r12, r3, lsl #1 // if inf * 0, this sets Z + lsr r0, r12, #1 // set up return value of +infinity + orrne r0, r0, r2, lsl #23 // if not inf * 0, put on the output sign + orreq r0, r0, #0x400000 // otherwise, set the 'quiet NaN' bit + bx lr // and return + +END_COMPILERRT_FUNCTION(__aeabi_fmul) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S new file mode 100644 index 0000000000000..f2ede1013a9e6 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S @@ -0,0 +1,251 @@ +//===-- mulsf3.S - single-precision floating point multiplication ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float multiplication with the +// IEEE-754 default rounding (to nearest, ties to even), in optimized Thumb1 +// assembly language. +// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" + + .syntax unified + .text + .thumb + .p2align 2 + +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__mulsf3) + push {r4,r5,r6,lr} + + // Get exponents of the inputs, and check for uncommon values. In the process + // of this we also compute the sign, because it's marginally quicker that + // way. 
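+  // The LSLS/ADCS pairs below compute the output sign without any branch:
+  // each LSLS moves one input's sign bit into C. The first ADCS shifts x's
+  // sign into bit 0 of r4; the second adds r3 plus y's sign in C, and since
+  // bit 0 of r3 (= r1 << 1) is zero, bit 0 of r4 ends up as the XOR of the
+  // two sign bits. Only bit 0 of r4 matters; everything above it is garbage
+  // that "lsls r4, r4, #31" discards later.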
+  lsls r2, r0, #1
+  adcs r4, r4, r4              // set r4[0] to sign bit of x
+  lsls r3, r1, #1
+  adcs r4, r4, r3              // set r4[0] to the output sign
+  lsrs r2, r2, #24
+  beq LOCAL_LABEL(zerodenorm0) // still do the next LSRS
+  lsrs r3, r3, #24
+  beq LOCAL_LABEL(zerodenorm)
+  cmp r2, #255
+  beq LOCAL_LABEL(naninf)
+  cmp r3, #255
+  beq LOCAL_LABEL(naninf)
+  // Compute the output exponent. We'll be generating our product _without_ the
+  // leading bit, so we subtract 0x7f rather than 0x80.
+  adds r2, r2, r3
+  subs r2, r2, #0x7f
+  // Blank off everything above the mantissas.
+  lsls r0, r0, #9
+  lsls r1, r1, #9
+LOCAL_LABEL(normalised): // we may come back here from zerodenorm
+  lsrs r0, r0, #9
+  lsrs r1, r1, #9
+  // Multiply. r0 and r1 are the mantissas of the inputs but without their
+  // leading bits, so the product we want in principle is P=(r0+2^23)(r1+2^23).
+  // P is at most (2^24-1)^2 < 2^48, so it fits in a word and a half.
+  //
+  // The technique below will actually compute P - 2^46, by not adding on the
+  // term where the two 2^23 are multiplied. The 48-bit result will be
+  // delivered in two output registers: one containing its top 32 bits, and
+  // another that agrees with it in (at least) the bottom 23 bits, so the two
+  // overlap in the middle. This is done using only two multiply instructions
+  // and some bookkeeping.
+  //
+  // In the comments I'll write X and Y for the original input mantissas (again
+  // without their leading bits). I'll also decompose them as X = xh + xl and
+  // Y = yh + yl, where xl and yl are in the range 0..2^8-1 and xh,yh are
+  // multiples of 2^8.
+  adds r5, r0, r1
+  lsls r5, r5, #7    // r5 = (X+Y) << 7
+  movs r6, r0
+  muls r6, r1, r6    // r6 is congruent mod 2^32 to X*Y
+  lsrs r0, r0, #8
+  lsrs r1, r1, #8
+  muls r0, r1, r0
+  lsls r1, r0, #16   // r1 is congruent mod 2^32 to xh*yh
+  subs r3, r6, r1    // now r3 is congruent mod 2^32 to
+                     // (X*Y) - (xh*yh) = xh*yl + xl*yh + xl*yl
+                     // and hence, since that is at most 0xfeff0001,
+                     // is _exactly_ equal to that
+  adds r0, r0, r5    // r0 is now (xh*yh + (X+Y)<<23) >> 16
+  lsrs r1, r3, #16   // r1 is the top 16 bits of r3, i.e.
+                     // (xh*yl + xl*yh + xl*yl) >> 16
+  adds r3, r0, r1    // now r3 equals
+                     // (xh*yh + xh*yl + xl*yh + xl*yl + (X+Y)<<23) >> 16
+                     // i.e. (X*Y + (X+Y)<<23) >> 16,
+                     // i.e. (the right answer) >> 16.
+                     // Meanwhile, r6 agrees with the right answer in its
+                     // bottom 23 bits (the (X+Y)<<23 term affects nothing
+                     // below bit 23), which is all we'll need of it.
+  // Renormalise if necessary.
+  lsrs r1, r3, #30
+  beq LOCAL_LABEL(norenorm)
+  // Here we have to do something fiddly. Renormalisation would be a trivial
+  // job if we had the leading mantissa bit - just note that it's one bit
+  // position above where it should be, and shift right by one. But without
+  // that bit, we currently have (2x - 2^30), and we want (x - 2^30); just
+  // shifting right would of course give us (x - 2^29), so we must subtract an
+  // extra 2^29 to fix this up.
+  lsrs r3, r3, #1
+  movs r1, #1
+  lsls r1, r1, #29
+  subs r3, r3, r1
+  adds r2, r2, #1
+LOCAL_LABEL(norenorm):
+  // Round and shift down to the right bit position.
+  lsrs r0, r3, #7              // round bit goes into the carry flag
+  bcc LOCAL_LABEL(rounded)
+  adds r0, r0, #1
+  // In the round-up branch, we must also check if we have to round to even, by
+  // testing all the bits below the round bit. We will normally not expect to,
+  // so we do RTE by branching out of line and back again to avoid spending a
+  // branch in the common case.
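+  // (The LSRS above moved the round bit, bit 6 of r3, into C; shifting r3
+  // left by 32-7+1 = 26 keeps only bits 0..5, the part of the sticky region
+  // that lives in r3 -- anything lower is checked via r6 just below.)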
+  lsls r5, r3, #32-7+1         // check the bits shifted out of r3 above
+  bne LOCAL_LABEL(rounded)     // if any is nonzero, we're not rounding to even
+  lsls r5, r6, #15             // check the bottom 17 bits of the low-order 32
+                               // (enough to overlap r3 even if we renormalised)
+  beq LOCAL_LABEL(rte)         // if any is nonzero, fall through, else RTE
+LOCAL_LABEL(rounded):
+  // Put on the sign and exponent, check for underflow and overflow, and
+  // return.
+  //
+  // Underflow occurs iff r2 (the output exponent) <= 0. Overflow occurs if
+  // it's >= 0xFF. (Also if it's 0xFE and we rounded up to overflow, but since
+  // this code doesn't report exceptions, we can ignore this case because it'll
+  // happen to return the right answer regardless). So we handle most of this
+  // via an unsigned comparison against 0xFF, which leaves the one case of a
+  // zero exponent that we have to filter separately by testing the Z flag
+  // after we shift the exponent back up into place.
+  cmp r2, #0xFF                // check for most over/underflows
+  bhs LOCAL_LABEL(outflow)     // ... and branch out of line for them
+  lsls r5, r2, #23             // shift the exponent into its output location
+  beq LOCAL_LABEL(outflow)     // ... and branch again if it was 0
+  lsls r4, r4, #31             // shift the output sign into place
+  orrs r0, r0, r4              // and OR it in to the output
+  adds r0, r0, r5              // add on the exponent (an ADD, not an OR, so
+                               // that a mantissa which rounded up to 2.0
+                               // carries into the exponent field)
+  pop {r4,r5,r6,pc}            // and return
+
+LOCAL_LABEL(rte):
+  // Out-of-line handler for the round-to-even case. Clear the low mantissa bit
+  // and go back to the post-rounding code.
+  movs r5, #1
+  bics r0, r0, r5
+  b LOCAL_LABEL(rounded)
+
+LOCAL_LABEL(outflow):
+  cmp r2, #0
+  bgt LOCAL_LABEL(overflow)
+  // To handle underflow, we construct an intermediate value in the IEEE 754
+  // style (using our existing full-length mantissa, and bias the exponent by
+  // +0xC0), and indicate whether that intermediate was rounded up, down or not
+  // at all. Then call the helper function funder, which will denormalise and
+  // re-round correctly.
+  lsls r1, r0, #7              // shift up the post-rounding mantissa
+  subs r1, r3, r1              // and subtract it from the pre-rounding version
+  lsls r6, r6, #15
+  cmp r6, #1                   // if the rest of the low bits are nonzero
+  adcs r1, r1, r1              // then set an extra bit at the bottom
+
+  lsls r4, r4, #31
+  orrs r0, r0, r4              // put on the sign
+  adds r2, r2, #192            // bias the exponent
+  lsls r3, r2, #23
+  adds r0, r0, r3              // put on the biased exponent
+
+  bl SYMBOL_NAME(__compiler_rt_funder)
+  pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(overflow):
+  // Handle overflow by returning an infinity of the correct sign.
+  lsls r4, r4, #8              // move the sign up to bit 8
+  movs r0, #0xff
+  orrs r0, r0, r4              // fill in an exponent just below it
+  lsls r0, r0, #23             // and shift those 9 bits up to the top of the word
+  pop {r4,r5,r6,pc}
+
+  // We come here if there's at least one zero or denormal. On the fast path
+  // above, it was convenient to check these before checking NaNs and
+  // infinities, but NaNs take precedence, so now we're off the fast path, we
+  // must still check for those.
+  //
+  // At the main entry point 'zerodenorm' we want r2 and r3 to be the two input
+  // exponents. So if we branched after shifting-and-checking r2, we come to
+  // this earlier entry point 'zerodenorm0' so that we still shift r3.
+LOCAL_LABEL(zerodenorm0):
+  lsrs r3, r3, #24
+LOCAL_LABEL(zerodenorm):
+  cmp r2, #255
+  beq LOCAL_LABEL(naninf)
+  cmp r3, #255
+  beq LOCAL_LABEL(naninf)
+  // Now we know we have at least one zero or denormal, and no NaN or infinity.
+  // Check if either input is actually zero.
We've ruled out 0 * infinity by + // this point, so any zero input means we return zero of the correct sign. + lsls r6, r0, #1 // is one input zero? + beq LOCAL_LABEL(zero) // yes, go and return zero + lsls r6, r1, #1 // is the other one zero? + bne LOCAL_LABEL(denorm) // if not, one must have been a denormal +LOCAL_LABEL(zero): + lsls r0, r4, #31 // shift up the output sign to make the return value + pop {r4,r5,r6,pc} + + // Handle denormals via the helper function fnorm2, which will break both + // inputs up into mantissa and exponent, renormalising and generating a + // negative exponent if necessary. +LOCAL_LABEL(denorm): + push {r0,r1,r2,r3} + mov r0, sp + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0,r1,r2,r3} + // Convert fnorm2's return values into the right form to rejoin the main + // code path. + lsls r0, r0, #1 + lsls r1, r1, #1 + adds r2, r2, r3 + subs r2, r2, #0x7f + b LOCAL_LABEL(normalised) + + // We come here if at least one input is a NaN or infinity. There may still + // be zeroes (or denormals, though they make no difference at this stage). +LOCAL_LABEL(naninf): + movs r6, #0xff + lsls r6, r6, #24 + lsls r5, r0, #1 + cmp r5, r6 + bhi LOCAL_LABEL(nan) // first operand is a NaN + lsls r5, r1, #1 + cmp r5, r6 + bhi LOCAL_LABEL(nan) // second operand is a NaN + + // We know we have at least one infinity, and no NaNs. We might also have a + // zero, in which case we return the default quiet NaN. + lsls r6, r0, #1 + beq LOCAL_LABEL(infzero) // if r0 is a zero, r1 must be inf + lsls r6, r1, #1 + beq LOCAL_LABEL(infzero) // if r1 is a zero, r0 must be inf + // Otherwise we have infinity * infinity, or infinity * finite. Just return + // an appropriately signed infinity. + b LOCAL_LABEL(overflow) // reuse the code there + + // We come here if at least one input is a NaN. Hand off to fnan2, which + // propagates an appropriate NaN to the output, dealing with the special + // cases of signalling/quiet NaNs. +LOCAL_LABEL(nan): + bl SYMBOL_NAME(__compiler_rt_fnan2) + pop {r4,r5,r6,pc} + + // Return a quiet NaN as the result of infinity * zero. +LOCAL_LABEL(infzero): + ldr r0, =0x7fc00000 + pop {r4,r5,r6,pc} + +END_COMPILERRT_FUNCTION(__mulsf3) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt index 63f4c94605c90..8e3cb35183ba7 100644 --- a/compiler-rt/test/builtins/CMakeLists.txt +++ b/compiler-rt/test/builtins/CMakeLists.txt @@ -35,6 +35,10 @@ if(APPLE) darwin_filter_host_archs(BUILTIN_SUPPORTED_ARCH BUILTIN_TEST_ARCH) endif() +if(COMPILER_RT_ARM_OPTIMIZED_FP) + list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_ARM_OPTIMIZED_FP) +endif() + foreach(arch ${BUILTIN_TEST_ARCH}) set(BUILTINS_TEST_TARGET_ARCH ${arch}) string(TOLOWER "-${arch}-${OS_NAME}" BUILTINS_TEST_CONFIG_SUFFIX) diff --git a/compiler-rt/test/builtins/Unit/divsf3_test.c b/compiler-rt/test/builtins/Unit/divsf3_test.c index f8cb6169ac283..12c5df5fdaae1 100644 --- a/compiler-rt/test/builtins/Unit/divsf3_test.c +++ b/compiler-rt/test/builtins/Unit/divsf3_test.c @@ -1,115 +1,428 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 // RUN: %clang_builtins %s %librt -o %t && %run %t
 // REQUIRES: librt_has_divsf3
 
 #include "int_lib.h"
+#include <inttypes.h>
 #include <stdio.h>
 
 #include "fp_test.h"
 
+// By default this test uses compareResultF to check the returned floats, which
+// accepts any returned NaN if the expected result is the canonical NaN value
+// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more
+// detailed handling of NaNs, we tighten up the check and include some extra
+// test cases specific to that NaN policy.
+#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP
+# define EXPECT_EXACT_RESULTS
+# define ARM_NAN_HANDLING
+#endif
+
 // Returns: a / b
 COMPILER_RT_ABI float __divsf3(float a, float b);
 
-int test__divsf3(float a, float b, uint32_t expected)
-{
-    float x = __divsf3(a, b);
-    int ret = compareResultF(x, expected);
+int test__divsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+  float a = fromRep32(a_rep), b = fromRep32(b_rep);
+  float x = __divsf3(a, b);
+#ifdef EXPECT_EXACT_RESULTS
+  int ret = toRep32(x) != expected_rep;
+#else
+  int ret = compareResultF(x, expected_rep);
+#endif
 
-    if (ret){
-        printf("error in test__divsf3(%.20e, %.20e) = %.20e, "
-               "expected %.20e\n", a, b, x,
-               fromRep32(expected));
-    }
-    return ret;
+  if (ret) {
+    printf("error in test__divsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
+           ", expected %08" PRIx32 "\n",
+           a_rep, b_rep, toRep32(x), expected_rep);
+  }
+  return ret;
 }
 
-int main()
-{
-    // Returned NaNs are assumed to be qNaN by default
-
-    // qNaN / any = qNaN
-    if (test__divsf3(makeQNaN32(), 3.F, UINT32_C(0x7fc00000)))
-        return 1;
-    // NaN / any = NaN
-    if (test__divsf3(makeNaN32(UINT32_C(0x123)), 3.F, UINT32_C(0x7fc00000)))
-        return 1;
-    // any / qNaN = qNaN
-    if (test__divsf3(3.F, makeQNaN32(), UINT32_C(0x7fc00000)))
-        return 1;
-    // any / NaN = NaN
-    if (test__divsf3(3.F, makeNaN32(UINT32_C(0x123)), UINT32_C(0x7fc00000)))
-        return 1;
-
-    // +Inf / positive = +Inf
-    if (test__divsf3(makeInf32(), 3.F, UINT32_C(0x7f800000)))
-        return 1;
-    // +Inf / negative = -Inf
-    if (test__divsf3(makeInf32(), -3.F, UINT32_C(0xff800000)))
-        return 1;
-    // -Inf / positive = -Inf
-    if (test__divsf3(makeNegativeInf32(), 3.F, UINT32_C(0xff800000)))
-        return 1;
-    // -Inf / negative = +Inf
-    if (test__divsf3(makeNegativeInf32(), -3.F, UINT32_C(0x7f800000)))
-        return 1;
-
-    // Inf / Inf = NaN
-    if (test__divsf3(makeInf32(), makeInf32(), UINT32_C(0x7fc00000)))
-        return 1;
-    // 0.0 / 0.0 = NaN
-    if (test__divsf3(+0x0.0p+0F, +0x0.0p+0F, UINT32_C(0x7fc00000)))
-        return 1;
-    // +0.0 / +Inf = +0.0
-    if (test__divsf3(+0x0.0p+0F, makeInf32(), UINT32_C(0x0)))
-        return 1;
-    // +Inf / +0.0 = +Inf
-    if (test__divsf3(makeInf32(), +0x0.0p+0F, UINT32_C(0x7f800000)))
-        return 1;
-
-    // positive / +0.0 = +Inf
-    if (test__divsf3(+1.F, +0x0.0p+0F, UINT32_C(0x7f800000)))
-        return 1;
-    // positive / -0.0 = -Inf
-    if (test__divsf3(+1.F, -0x0.0p+0F, UINT32_C(0xff800000)))
-        return 1;
-    // negative / +0.0 = -Inf
-    if (test__divsf3(-1.F, +0x0.0p+0F, UINT32_C(0xff800000)))
-        return 1;
-    // negative / -0.0 = +Inf
-    if (test__divsf3(-1.F, -0x0.0p+0F, UINT32_C(0x7f800000)))
-        return 1;
-
-    // 1/3
-    if (test__divsf3(1.F, 3.F, UINT32_C(0x3eaaaaab)))
-        return 1;
-    // smallest normal result
-    if (test__divsf3(0x1.0p-125F, 2.F, UINT32_C(0x00800000)))
-        return 1;
+int main(void) {
+  int status = 0;
 
-    // divisor is exactly 1.0
-    if (test__divsf3(0x1.0p+0F, 0x1.0p+0F, UINT32_C(0x3f800000)))
-        return 1;
-    // divisor is
truncated to exactly 1.0 in UQ1.15 - if (test__divsf3(0x1.0p+0F, 0x1.0001p+0F, UINT32_C(0x3f7fff00))) - return 1; + status |= test__divsf3(0x00000000, 0x00000001, 0x00000000); + status |= test__divsf3(0x00000000, 0x007fffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x00800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x00ffffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x3f800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x40a00000, 0x00000000); + status |= test__divsf3(0x00000000, 0x7effffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x7f000000, 0x00000000); + status |= test__divsf3(0x00000000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x80000002, 0x80000000); + status |= test__divsf3(0x00000000, 0x807fffff, 0x80000000); + status |= test__divsf3(0x00000000, 0x80800001, 0x80000000); + status |= test__divsf3(0x00000000, 0x81000000, 0x80000000); + status |= test__divsf3(0x00000000, 0xc0400000, 0x80000000); + status |= test__divsf3(0x00000000, 0xc0e00000, 0x80000000); + status |= test__divsf3(0x00000000, 0xfe7fffff, 0x80000000); + status |= test__divsf3(0x00000000, 0xff000000, 0x80000000); + status |= test__divsf3(0x00000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x00000001, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00000001, 0x3e000000, 0x00000008); + status |= test__divsf3(0x00000001, 0x3f000000, 0x00000002); + status |= test__divsf3(0x00000001, 0x40000000, 0x00000000); + status |= test__divsf3(0x00000001, 0x7f7fffff, 0x00000000); + status |= test__divsf3(0x00000001, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00000001, 0xc0000000, 0x80000000); + status |= test__divsf3(0x00000001, 0xff7fffff, 0x80000000); + status |= test__divsf3(0x00000002, 0x80000000, 0xff800000); + status |= test__divsf3(0x00000002, 0xff800000, 0x80000000); + status |= test__divsf3(0x00000009, 0x41100000, 0x00000001); + status |= test__divsf3(0x00000009, 0xc1100000, 0x80000001); + status |= test__divsf3(0x007ffff7, 0x3f7ffffe, 0x007ffff8); + status |= test__divsf3(0x007ffffe, 0x3f7ffffe, 0x007fffff); + status |= test__divsf3(0x007fffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x007fffff, 0x3b000000, 0x04fffffe); + status |= test__divsf3(0x007fffff, 0x3f000000, 0x00fffffe); + status |= test__divsf3(0x007fffff, 0x3f800000, 0x007fffff); + status |= test__divsf3(0x007fffff, 0x3f800002, 0x007ffffd); + status |= test__divsf3(0x007fffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x007fffff, 0x80000000, 0xff800000); + status |= test__divsf3(0x007fffff, 0xbf800000, 0x807fffff); + status |= test__divsf3(0x007fffff, 0xff800000, 0x80000000); + status |= test__divsf3(0x00800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00800000, 0x3f800001, 0x007fffff); + status |= test__divsf3(0x00800000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00800001, 0x3f800002, 0x007fffff); + status |= test__divsf3(0x00800001, 0x80000000, 0xff800000); + status |= test__divsf3(0x00800001, 0xff800000, 0x80000000); + status |= test__divsf3(0x00800002, 0x3f800006, 0x007ffffc); + status |= test__divsf3(0x00fffffe, 0x40000000, 0x007fffff); + status |= test__divsf3(0x00ffffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00ffffff, 0x40000000, 0x00800000); + status |= test__divsf3(0x00ffffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x01000000, 0x00800000, 0x40000000); + status |= test__divsf3(0x01000000, 0x80000000, 0xff800000); + status |= test__divsf3(0x01000000, 0xc0000000, 0x80800000); + status |= 
test__divsf3(0x01000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x01000001, 0x00800001, 0x40000000); + status |= test__divsf3(0x01000001, 0xc0000000, 0x80800001); + status |= test__divsf3(0x01000003, 0x80800003, 0xc0000000); + status |= test__divsf3(0x01000003, 0xc0000000, 0x80800003); + status |= test__divsf3(0x3f7ffff7, 0x3f7ffffb, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffff7, 0x3f7ffffe, 0x3f7ffff9); + status |= test__divsf3(0x3f7ffff8, 0x3f7ffffc, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffff8, 0x3f7ffffd, 0x3f7ffffb); + status |= test__divsf3(0x3f7ffffa, 0x3f7ffff9, 0x3f800001); + status |= test__divsf3(0x3f7ffffb, 0x3f7ffff9, 0x3f800001); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffff9, 0x3f800002); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffffd, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffffe, 0x3f7ffffe); + status |= test__divsf3(0x3f7ffffc, 0x3f7fffff, 0x3f7ffffd); + status |= test__divsf3(0x3f7ffffc, 0x3f800001, 0x3f7ffffa); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffff9, 0x3f800002); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffffc, 0x3f800001); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffffe, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffd, 0x3f7fffff, 0x3f7ffffe); + status |= test__divsf3(0x3f7ffffd, 0x3f800001, 0x3f7ffffb); + status |= test__divsf3(0x3f7ffffd, 0x3f800002, 0x3f7ffff9); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffff9, 0x3f800003); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffffc, 0x3f800001); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffffd, 0x3f800001); + status |= test__divsf3(0x3f7ffffe, 0x3f7fffff, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffe, 0x3f800001, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffffe, 0x3f800002, 0x3f7ffffa); + status |= test__divsf3(0x3f7ffffe, 0x3f800003, 0x3f7ffff8); + status |= test__divsf3(0x3f7fffff, 0x3f7ffff9, 0x3f800003); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffc, 0x3f800002); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffd, 0x3f800001); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffe, 0x3f800001); + status |= test__divsf3(0x3f7fffff, 0x3f800001, 0x3f7ffffd); + status |= test__divsf3(0x3f7fffff, 0x3f800002, 0x3f7ffffb); + status |= test__divsf3(0x3f7fffff, 0x3f800003, 0x3f7ffff9); + status |= test__divsf3(0x3f7fffff, 0x3f800004, 0x3f7ffff7); + status |= test__divsf3(0x3f800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x3f800000, 0x3f7ffff7, 0x3f800005); + status |= test__divsf3(0x3f800000, 0x3f7ffff8, 0x3f800004); + status |= test__divsf3(0x3f800000, 0x3f7ffffb, 0x3f800003); + status |= test__divsf3(0x3f800000, 0x3f7ffffc, 0x3f800002); + status |= test__divsf3(0x3f800000, 0x3f7ffffd, 0x3f800002); + status |= test__divsf3(0x3f800000, 0x3f7ffffe, 0x3f800001); + status |= test__divsf3(0x3f800000, 0x3f7fffff, 0x3f800001); + status |= test__divsf3(0x3f800000, 0x3f800000, 0x3f800000); + status |= test__divsf3(0x3f800000, 0x3f800001, 0x3f7ffffe); + status |= test__divsf3(0x3f800000, 0x3f800002, 0x3f7ffffc); + status |= test__divsf3(0x3f800000, 0x3f800003, 0x3f7ffffa); + status |= test__divsf3(0x3f800000, 0x3f800004, 0x3f7ffff8); + status |= test__divsf3(0x3f800000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x3f800001, 0x3f7ffffb, 0x3f800004); + status |= test__divsf3(0x3f800001, 0x3f7ffffd, 0x3f800003); + status |= test__divsf3(0x3f800001, 0x3f7ffffe, 0x3f800002); + status |= test__divsf3(0x3f800001, 0x3f7fffff, 0x3f800002); + status |= test__divsf3(0x3f800001, 0x3f800002, 0x3f7ffffe); + status |= test__divsf3(0x3f800001, 0x3f800003, 0x3f7ffffc); + status |= 
test__divsf3(0x3f800002, 0x3f7ffffc, 0x3f800004); + status |= test__divsf3(0x3f800002, 0x3f7ffffd, 0x3f800004); + status |= test__divsf3(0x3f800002, 0x3f7ffffe, 0x3f800003); + status |= test__divsf3(0x3f800002, 0x3f7fffff, 0x3f800003); + status |= test__divsf3(0x3f800002, 0x3f800001, 0x3f800001); + status |= test__divsf3(0x3f800002, 0x3f800003, 0x3f7ffffe); + status |= test__divsf3(0x3f800003, 0x3f7ffffd, 0x3f800005); + status |= test__divsf3(0x3f800003, 0x3f7ffffe, 0x3f800004); + status |= test__divsf3(0x3f800003, 0x3f7fffff, 0x3f800004); + status |= test__divsf3(0x3f800003, 0x3f800001, 0x3f800002); + status |= test__divsf3(0x3f800004, 0x3f7ffffe, 0x3f800005); + status |= test__divsf3(0x3f800004, 0x3f800001, 0x3f800003); + status |= test__divsf3(0x3f800004, 0x3f800007, 0x3f7ffffa); + status |= test__divsf3(0x3f800005, 0x3f7fffff, 0x3f800006); + status |= test__divsf3(0x3f800006, 0x3f800008, 0x3f7ffffc); + status |= test__divsf3(0x3f800007, 0x3f800002, 0x3f800005); + status |= test__divsf3(0x3f800009, 0x3f800008, 0x3f800001); + status |= test__divsf3(0x40000000, 0x3f800000, 0x40000000); + status |= test__divsf3(0x40000000, 0xbf800000, 0xc0000000); + status |= test__divsf3(0x40400000, 0x80000000, 0xff800000); + status |= test__divsf3(0x40400000, 0xc0400000, 0xbf800000); + status |= test__divsf3(0x40400000, 0xff800000, 0x80000000); + status |= test__divsf3(0x40a00000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x40a00000, 0x40a00000, 0x3f800000); + status |= test__divsf3(0x40a00000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x40e00000, 0x80000000, 0xff800000); + status |= test__divsf3(0x40e00000, 0xff800000, 0x80000000); + status |= test__divsf3(0x41000000, 0x40000000, 0x40800000); + status |= test__divsf3(0x41100000, 0x40400000, 0x40400000); + status |= test__divsf3(0x7b000000, 0x05000000, 0x7f800000); + status |= test__divsf3(0x7e7fffff, 0x80000000, 0xff800000); + status |= test__divsf3(0x7efffffd, 0xc0000000, 0xfe7ffffd); + status |= test__divsf3(0x7effffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7effffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x7f000000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x007fffff, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x3f000000, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x40000000, 0x7e800000); + status |= test__divsf3(0x7f000000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x7f000000, 0x80000000, 0xff800000); + status |= test__divsf3(0x7f000000, 0xbf000000, 0xff800000); + status |= test__divsf3(0x7f000000, 0xc0000000, 0xfe800000); + status |= test__divsf3(0x7f000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x7f000003, 0xfe800003, 0xc0000000); + status |= test__divsf3(0x7f7ffffd, 0x40800000, 0x7e7ffffd); + status |= test__divsf3(0x7f7ffffd, 0xc0800000, 0xfe7ffffd); + status |= test__divsf3(0x7f7fffff, 0x00000001, 0x7f800000); + status |= test__divsf3(0x7f7fffff, 0x3f7fffff, 0x7f800000); + status |= test__divsf3(0x7f7fffff, 0x7e7fffff, 0x40800000); + status |= test__divsf3(0x7f7fffff, 0x7effffff, 0x40000000); + status |= test__divsf3(0x7f7fffff, 0xc0000000, 0xfeffffff); + status |= test__divsf3(0x7f7fffff, 0xfe7fffff, 0xc0800000); + status |= test__divsf3(0x7f7fffff, 0xff800000, 0x80000000); + status |= test__divsf3(0x7f800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00000001, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00800000, 0x7f800000); + status |= 
test__divsf3(0x7f800000, 0x00ffffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x3f800000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x40a00000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x7effffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x7f000000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x80000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0x80000002, 0xff800000); + status |= test__divsf3(0x7f800000, 0x807fffff, 0xff800000); + status |= test__divsf3(0x7f800000, 0x80800001, 0xff800000); + status |= test__divsf3(0x7f800000, 0x81000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xc0400000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xc0e00000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xfe7fffff, 0xff800000); + status |= test__divsf3(0x7f800000, 0xff000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xff7fffff, 0xff800000); + status |= test__divsf3(0x80000000, 0x00000003, 0x80000000); + status |= test__divsf3(0x80000000, 0x007fffff, 0x80000000); + status |= test__divsf3(0x80000000, 0x00800001, 0x80000000); + status |= test__divsf3(0x80000000, 0x01000000, 0x80000000); + status |= test__divsf3(0x80000000, 0x40000000, 0x80000000); + status |= test__divsf3(0x80000000, 0x40c00000, 0x80000000); + status |= test__divsf3(0x80000000, 0x7e7fffff, 0x80000000); + status |= test__divsf3(0x80000000, 0x7e800000, 0x80000000); + status |= test__divsf3(0x80000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80000000, 0x80000004, 0x00000000); + status |= test__divsf3(0x80000000, 0x807fffff, 0x00000000); + status |= test__divsf3(0x80000000, 0x80800000, 0x00000000); + status |= test__divsf3(0x80000000, 0x80ffffff, 0x00000000); + status |= test__divsf3(0x80000000, 0xc0800000, 0x00000000); + status |= test__divsf3(0x80000000, 0xc1000000, 0x00000000); + status |= test__divsf3(0x80000000, 0xfe800000, 0x00000000); + status |= test__divsf3(0x80000000, 0xfeffffff, 0x00000000); + status |= test__divsf3(0x80000000, 0xff800000, 0x00000000); + status |= test__divsf3(0x80000001, 0x3f000000, 0x80000002); + status |= test__divsf3(0x80000001, 0x40000000, 0x80000000); + status |= test__divsf3(0x80000001, 0x7f7fffff, 0x80000000); + status |= test__divsf3(0x80000001, 0xc0000000, 0x00000000); + status |= test__divsf3(0x80000001, 0xff7fffff, 0x00000000); + status |= test__divsf3(0x80000003, 0x00000000, 0xff800000); + status |= test__divsf3(0x80000003, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80000004, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80000004, 0xff800000, 0x00000000); + status |= test__divsf3(0x807ffff8, 0x3f7ffffe, 0x807ffff9); + status |= test__divsf3(0x807fffff, 0x00000000, 0xff800000); + status |= test__divsf3(0x807fffff, 0x7f800000, 0x80000000); + status |= test__divsf3(0x807fffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0x807fffff, 0xff800000, 0x00000000); + status |= test__divsf3(0x80800000, 0x3f800001, 0x807fffff); + status |= test__divsf3(0x80800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80800000, 0xff800000, 0x00000000); + status |= test__divsf3(0x80800001, 0x00000000, 0xff800000); + status |= test__divsf3(0x80800001, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80ffffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80ffffff, 0xff800000, 0x00000000); + status |= test__divsf3(0x81000000, 0x00000000, 0xff800000); + status |= test__divsf3(0x81000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0x81000001, 0x00800001, 0xc0000000); + status |= 
test__divsf3(0x81000005, 0x00800005, 0xc0000000); + status |= test__divsf3(0xbf800000, 0x3f800000, 0xbf800000); + status |= test__divsf3(0xbf800000, 0xbf800000, 0x3f800000); + status |= test__divsf3(0xc0000000, 0x00000000, 0xff800000); + status |= test__divsf3(0xc0000000, 0x3f800000, 0xc0000000); + status |= test__divsf3(0xc0000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xc0000000, 0xbf800000, 0x40000000); + status |= test__divsf3(0xc0800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xc0800000, 0xff800000, 0x00000000); + status |= test__divsf3(0xc0c00000, 0x00000000, 0xff800000); + status |= test__divsf3(0xc0c00000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xc0c00000, 0xc0400000, 0x40000000); + status |= test__divsf3(0xc0e00000, 0x40e00000, 0xbf800000); + status |= test__divsf3(0xc1000000, 0x40000000, 0xc0800000); + status |= test__divsf3(0xc1000000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xc1000000, 0xff800000, 0x00000000); + status |= test__divsf3(0xc1100000, 0xc0400000, 0x40400000); + status |= test__divsf3(0xfe7fffff, 0x00000000, 0xff800000); + status |= test__divsf3(0xfe7fffff, 0x7f800000, 0x80000000); + status |= test__divsf3(0xfe800000, 0x00000000, 0xff800000); + status |= test__divsf3(0xfe800000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xfe800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xfe800000, 0xff800000, 0x00000000); + status |= test__divsf3(0xfeffffff, 0x40000000, 0xfe7fffff); + status |= test__divsf3(0xfeffffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0xff000000, 0x3f000000, 0xff800000); + status |= test__divsf3(0xff000000, 0xbf000000, 0x7f800000); + status |= test__divsf3(0xff000001, 0x7e800001, 0xc0000000); + status |= test__divsf3(0xff7ffffd, 0x40800000, 0xfe7ffffd); + status |= test__divsf3(0xff7ffffd, 0xc0800000, 0x7e7ffffd); + status |= test__divsf3(0xff7fffff, 0x7e7fffff, 0xc0800000); + status |= test__divsf3(0xff7fffff, 0xfe7fffff, 0x40800000); + status |= test__divsf3(0xff7fffff, 0xff800000, 0x00000000); + status |= test__divsf3(0xff800000, 0x00000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x00000003, 0xff800000); + status |= test__divsf3(0xff800000, 0x007fffff, 0xff800000); + status |= test__divsf3(0xff800000, 0x00800001, 0xff800000); + status |= test__divsf3(0xff800000, 0x01000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x40000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x40c00000, 0xff800000); + status |= test__divsf3(0xff800000, 0x7e800000, 0xff800000); + status |= test__divsf3(0xff800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80000004, 0x7f800000); + status |= test__divsf3(0xff800000, 0x807fffff, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80ffffff, 0x7f800000); + status |= test__divsf3(0xff800000, 0xc0800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xc1000000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xfe800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xff7fffff, 0x7f800000); + status |= test__divsf3(0x2cbed883, 0x333f6113, 0x38ff4953); + status |= test__divsf3(0x3f87ffff, 0x7f001000, 0x0043f781); - // smallest normal value divided by 2.0 - if (test__divsf3(0x1.0p-126F, 2.0F, UINT32_C(0x00400000))) - return 1; - // smallest subnormal result - if (test__divsf3(0x1.0p-126F, 0x1p+23F, UINT32_C(0x00000001))) - return 1; + // Test that the result of an operation is a NaN at all when it should be. 
+  //
+  // In most configurations these tests' results are checked using
+  // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000,
+  // which causes compareResultF to accept any NaN encoding. We also use the
+  // same value as the input NaN in tests that have one, so that even in
+  // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is
+  // still the exact expected NaN.
+  status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000);
+  status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000);
+  status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000);
+  status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000);
+  status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000);
+  status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000);
+  status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000);
+  status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000);
+  status |= test__divsf3(0x3f800000, 0x7fc00000, 0x7fc00000);
+  status |= test__divsf3(0x7fc00000, 0x3f800000, 0x7fc00000);
+  status |= test__divsf3(0x7fc00000, 0x7fc00000, 0x7fc00000);
-
-    // some misc test cases obtained by fuzzing against h/w implementation
-    if (test__divsf3(-0x1.3e75e6p-108F, -0x1.cf372p+38F, UINT32_C(0x00000006)))
-        return 1;
-    if (test__divsf3(0x1.e77c54p+81F, -0x1.e77c52p-47F, UINT32_C(0xff800000)))
-        return 1;
-    if (test__divsf3(0x1.fffffep-126F, 2.F, UINT32_C(0x00800000)))
-        return 1;
+
+#ifdef ARM_NAN_HANDLING
+  // Tests specific to the NaN handling of Arm hardware, mimicked by
+  // arm/divsf3.S:
+  //
+  // - a quiet NaN is distinguished by the top mantissa bit being 1
+  //
+  // - if a signalling NaN appears in the input, the output quiet NaN is
+  //   obtained by setting its top mantissa bit and leaving everything else
+  //   unchanged
+  //
+  // - if both operands are signalling NaNs then the output NaN is derived
+  //   from the first operand
+  //
+  // - if both operands are quiet NaNs then the output NaN is the first
+  //   operand
+  //
+  // - invalid operations not involving an input NaN return the quiet
+  //   NaN with fewest bits set, 0x7fc00000.
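+  //
+  // For example, in test__divsf3(0x00000000, 0x7fad4be3, 0x7fed4be3) below,
+  // the second operand is a signalling NaN (quiet bit 0x00400000 clear), and
+  // the expected result is the same payload with just the quiet bit set.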
- // test 1 / (1 - eps(0.5)) = 1 + eps(1) - if (test__divsf3(1.0F, 0x1.fffffep-1F, UINT32_C(0x3f800001))) - return 1; + status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__divsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__divsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__divsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__divsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__divsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__divsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__divsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__divsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__divsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__divsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__divsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__divsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__divsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__divsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__divsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__divsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__divsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__divsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__divsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__divsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__divsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__divsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= test__divsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__divsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); + status |= test__divsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__divsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= test__divsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= test__divsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__divsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__divsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= test__divsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__divsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__divsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__divsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__divsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__divsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__divsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__divsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__divsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__divsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__divsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__divsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__divsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= 
test__divsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7);
+  status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000);
+  status |= test__divsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc);
+  status |= test__divsf3(0xff800000, 0x7fde0397, 0x7fde0397);
+  status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000);
+#endif // ARM_NAN_HANDLING
 
-    return 0;
+  return status;
 }
diff --git a/compiler-rt/test/builtins/Unit/mulsf3_test.c b/compiler-rt/test/builtins/Unit/mulsf3_test.c
new file mode 100644
index 0000000000000..7dc7c8ad39c32
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/mulsf3_test.c
@@ -0,0 +1,616 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_mulsf3
+
+#include "int_lib.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "fp_test.h"
+
+// By default this test uses compareResultF to check the returned floats, which
+// accepts any returned NaN if the expected result is the canonical NaN value
+// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more
+// detailed handling of NaNs, we tighten up the check and include some extra
+// test cases specific to that NaN policy.
+#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP
+# define EXPECT_EXACT_RESULTS
+# define ARM_NAN_HANDLING
+#endif
+
+// Returns: a * b
+COMPILER_RT_ABI float __mulsf3(float a, float b);
+
+int test__mulsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+  float a = fromRep32(a_rep), b = fromRep32(b_rep);
+  float x = __mulsf3(a, b);
+#ifdef EXPECT_EXACT_RESULTS
+  int ret = toRep32(x) != expected_rep;
+#else
+  int ret = compareResultF(x, expected_rep);
+#endif
+
+  if (ret) {
+    printf("error in test__mulsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
+           ", expected %08" PRIx32 "\n",
+           a_rep, b_rep, toRep32(x), expected_rep);
+  }
+  return ret;
+}
+
+int main(void) {
+  int status = 0;
+
+  status |= test__mulsf3(0x00000000, 0x00000000, 0x00000000);
+  status |= test__mulsf3(0x00000000, 0x007fffff, 0x00000000);
+  status |= test__mulsf3(0x00000000, 0x00ffffff, 0x00000000);
+  status |= test__mulsf3(0x00000000, 0x3f800000, 0x00000000);
+  status |= test__mulsf3(0x00000000, 0x7effffff, 0x00000000);
+  status |= test__mulsf3(0x00000000, 0x80000000, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0x80000002, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0x807fffff, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0x80800001, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0x81000000, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0xc0400000, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0xfe7fffff, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0xff000000, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0xff7fffff, 0x80000000);
+  status |= test__mulsf3(0x00000001, 0x00000000, 0x00000000);
+  status |= test__mulsf3(0x00000001, 0x00000001, 0x00000000);
+  status |= test__mulsf3(0x00000001, 0x3f000000, 0x00000000);
+  status |= test__mulsf3(0x00000001, 0x3f7fffff, 0x00000001);
+  status |= test__mulsf3(0x00000001, 0x3f800000, 0x00000001);
+  status |= test__mulsf3(0x00000001, 0x40000000, 0x00000002);
+  status |= test__mulsf3(0x00000001, 0x7f800000, 0x7f800000);
+  status |= test__mulsf3(0x00000001, 0xbf7fffff, 0x80000001);
+  status |= test__mulsf3(0x00000006, 0x3f000000, 0x00000003);
+  status |= test__mulsf3(0x00000006, 0xbf000000, 0x80000003);
+  status |=
test__mulsf3(0x00000008, 0x3e000000, 0x00000001); + status |= test__mulsf3(0x007ffff7, 0x81000003, 0x80000000); + status |= test__mulsf3(0x007ffff8, 0x3f800001, 0x007ffff9); + status |= test__mulsf3(0x007ffff8, 0x3f800008, 0x00800000); + status |= test__mulsf3(0x007ffff8, 0xbf800001, 0x807ffff9); + status |= test__mulsf3(0x007ffff8, 0xbf800008, 0x80800000); + status |= test__mulsf3(0x007ffffc, 0x40000000, 0x00fffff8); + status |= test__mulsf3(0x007ffffe, 0x3f7ffffc, 0x007ffffc); + status |= test__mulsf3(0x007ffffe, 0x3f800001, 0x007fffff); + status |= test__mulsf3(0x007ffffe, 0xbf800001, 0x807fffff); + status |= test__mulsf3(0x007fffff, 0x007ffffe, 0x00000000); + status |= test__mulsf3(0x007fffff, 0x3f800001, 0x00800000); + status |= test__mulsf3(0x007fffff, 0x40000000, 0x00fffffe); + status |= test__mulsf3(0x00800000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00800000, 0x00800000, 0x00000000); + status |= test__mulsf3(0x00800000, 0x3f7ffffe, 0x007fffff); + status |= test__mulsf3(0x00800000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x00800000, 0x80800000, 0x80000000); + status |= test__mulsf3(0x00800000, 0xc0000000, 0x81000000); + status |= test__mulsf3(0x00800001, 0x3f7ffffa, 0x007ffffe); + status |= test__mulsf3(0x00800001, 0x3f7ffffe, 0x00800000); + status |= test__mulsf3(0x00800001, 0xc0000000, 0x81000001); + status |= test__mulsf3(0x00800002, 0x3f7ffffc, 0x00800000); + status |= test__mulsf3(0x00fffff8, 0x3f000000, 0x007ffffc); + status |= test__mulsf3(0x00fffffe, 0x3f000000, 0x007fffff); + status |= test__mulsf3(0x00fffffe, 0xbf000000, 0x807fffff); + status |= test__mulsf3(0x00ffffff, 0x3f000000, 0x00800000); + status |= test__mulsf3(0x00ffffff, 0xbf000000, 0x80800000); + status |= test__mulsf3(0x3f000000, 0x80000001, 0x80000000); + status |= test__mulsf3(0x3f800000, 0x007ffffd, 0x007ffffd); + status |= test__mulsf3(0x3f800000, 0x01000003, 0x01000003); + status |= test__mulsf3(0x3f800000, 0x3f800000, 0x3f800000); + status |= test__mulsf3(0x3f800000, 0x40000000, 0x40000000); + status |= test__mulsf3(0x3f800000, 0x80000001, 0x80000001); + status |= test__mulsf3(0x3f800000, 0x80000009, 0x80000009); + status |= test__mulsf3(0x3f800001, 0x3f800001, 0x3f800002); + status |= test__mulsf3(0x3f800001, 0xbf800001, 0xbf800002); + status |= test__mulsf3(0x3f800001, 0xbf800002, 0xbf800003); + status |= test__mulsf3(0x3f800002, 0x3f800001, 0x3f800003); + status |= test__mulsf3(0x3f800002, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x3f800001, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x40000000, 0x00800000, 0x01000000); + status |= test__mulsf3(0x40000000, 0x00800001, 0x01000001); + status |= test__mulsf3(0x40000000, 0x3f800000, 0x40000000); + status |= test__mulsf3(0x40000000, 0x40400000, 0x40c00000); + status |= test__mulsf3(0x40000000, 0x7e800000, 0x7f000000); + status |= test__mulsf3(0x40000000, 0x7effffff, 0x7f7fffff); + status |= test__mulsf3(0x40000000, 0x807ffffd, 0x80fffffa); + status |= test__mulsf3(0x40000000, 0x80800003, 0x81000003); + status |= test__mulsf3(0x40000000, 0x80800005, 0x81000005); + status |= test__mulsf3(0x40000000, 0xbf800000, 0xc0000000); + status |= test__mulsf3(0x40000000, 0xfe7ffffd, 0xfefffffd); + status |= test__mulsf3(0x40000000, 0xfe800003, 0xff000003); + status |= test__mulsf3(0x403fffff, 0x3f7ffffd, 0x403ffffd); + status |= test__mulsf3(0x403fffff, 0x3f7ffffe, 0x403ffffe); + status |= test__mulsf3(0x403fffff, 0x3f7fffff, 0x403ffffe); + status |= test__mulsf3(0x403fffff, 0xbf7ffffd, 0xc03ffffd); + status |= 
test__mulsf3(0x40400000, 0x00000002, 0x00000006); + status |= test__mulsf3(0x40400000, 0x40000000, 0x40c00000); + status |= test__mulsf3(0x40400000, 0x40400000, 0x41100000); + status |= test__mulsf3(0x40400000, 0xc0000000, 0xc0c00000); + status |= test__mulsf3(0x40400001, 0x3f800001, 0x40400003); + status |= test__mulsf3(0x40400001, 0x3f800003, 0x40400006); + status |= test__mulsf3(0x40400001, 0xbf800003, 0xc0400006); + status |= test__mulsf3(0x40800000, 0x00000002, 0x00000008); + status |= test__mulsf3(0x40800000, 0x7e7fffff, 0x7f7fffff); + status |= test__mulsf3(0x40800000, 0xfe7fffff, 0xff7fffff); + status |= test__mulsf3(0x409fffff, 0x3f7fffff, 0x409ffffe); + status |= test__mulsf3(0x40a00000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x40a00000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x40a00001, 0x3f800001, 0x40a00002); + status |= test__mulsf3(0x40dfffff, 0x3f7ffffc, 0x40dffffc); + status |= test__mulsf3(0x40dfffff, 0x3f7fffff, 0x40dffffe); + status |= test__mulsf3(0x40e00000, 0x80000000, 0x80000000); + status |= test__mulsf3(0x40e00000, 0xff800000, 0xff800000); + status |= test__mulsf3(0x40e00001, 0x3f800001, 0x40e00003); + status |= test__mulsf3(0x7e7ffffd, 0x40800000, 0x7f7ffffd); + status |= test__mulsf3(0x7e7ffffd, 0xc0800000, 0xff7ffffd); + status |= test__mulsf3(0x7e800000, 0xc0000000, 0xff000000); + status |= test__mulsf3(0x7efffffd, 0xc0000008, 0xff800000); + status |= test__mulsf3(0x7effffff, 0xc0000000, 0xff7fffff); + status |= test__mulsf3(0x7f000000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x7f000000, 0x40000000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f000000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0xfe800000, 0xff800000); + status |= test__mulsf3(0x7f000000, 0xfe800004, 0xff800000); + status |= test__mulsf3(0x7f000000, 0xff000000, 0xff800000); + status |= test__mulsf3(0x7f000009, 0x7f7ffffa, 0x7f800000); + status |= test__mulsf3(0x7f000009, 0xc0c00002, 0xff800000); + status |= test__mulsf3(0x7f7fffff, 0x00000000, 0x00000000); + status |= test__mulsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x00ffffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x3f800000, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x7effffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x80000002, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x807fffff, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x80800001, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x81000000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xc0400000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff000000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff7fffff, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff800000, 0xff800000); + status |= test__mulsf3(0x80000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80000000, 0x40c00000, 0x80000000); + status |= test__mulsf3(0x80000000, 0x7f7fffff, 0x80000000); + status |= test__mulsf3(0x80000000, 0x80000000, 0x00000000); + status |= test__mulsf3(0x80000000, 0x80000004, 0x00000000); + status |= test__mulsf3(0x80000000, 0x80800000, 0x00000000); + status |= test__mulsf3(0x80000000, 0xc1000000, 0x00000000); + status |= test__mulsf3(0x80000000, 0xfe800000, 0x00000000); + status |= test__mulsf3(0x80000001, 0x00000001, 0x80000000); + status |= 
test__mulsf3(0x80000001, 0x40a00000, 0x80000005); + status |= test__mulsf3(0x80000002, 0x3f800000, 0x80000002); + status |= test__mulsf3(0x80000003, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80000003, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x80000004, 0xbf800000, 0x00000004); + status |= test__mulsf3(0x80000008, 0x3e000000, 0x80000001); + status |= test__mulsf3(0x807ffff7, 0x01000003, 0x80000000); + status |= test__mulsf3(0x807ffff7, 0x3f800001, 0x807ffff8); + status |= test__mulsf3(0x807ffffd, 0xc0000000, 0x00fffffa); + status |= test__mulsf3(0x807fffff, 0x00000000, 0x80000000); + status |= test__mulsf3(0x807fffff, 0x3f800001, 0x80800000); + status |= test__mulsf3(0x807fffff, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x807fffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0x807fffff, 0x807ffffe, 0x00000000); + status |= test__mulsf3(0x807fffff, 0xbf800000, 0x007fffff); + status |= test__mulsf3(0x807fffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x80800000, 0x00800000, 0x80000000); + status |= test__mulsf3(0x80800000, 0x80800000, 0x00000000); + status |= test__mulsf3(0x80800001, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80800001, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x80800001, 0xbf800000, 0x00800001); + status |= test__mulsf3(0x80fffffc, 0x3f000000, 0x807ffffe); + status |= test__mulsf3(0x80fffffc, 0xbf000000, 0x007ffffe); + status |= test__mulsf3(0x80fffffe, 0x3f800000, 0x80fffffe); + status |= test__mulsf3(0x80ffffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0x80ffffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x81000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0x81000000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xbf7fffff, 0xff7fffff, 0x7f7ffffe); + status |= test__mulsf3(0xbf800000, 0x00000009, 0x80000009); + status |= test__mulsf3(0xbf800000, 0x00800009, 0x80800009); + status |= test__mulsf3(0xbf800000, 0x3f800000, 0xbf800000); + status |= test__mulsf3(0xbf800000, 0x40000000, 0xc0000000); + status |= test__mulsf3(0xbf800000, 0xbf800000, 0x3f800000); + status |= test__mulsf3(0xbf800000, 0xc0000000, 0x40000000); + status |= test__mulsf3(0xbf800001, 0x3f800001, 0xbf800002); + status |= test__mulsf3(0xbf800001, 0xbf800001, 0x3f800002); + status |= test__mulsf3(0xbf800001, 0xbf800002, 0x3f800003); + status |= test__mulsf3(0xbf800002, 0x3f800001, 0xbf800003); + status |= test__mulsf3(0xbf800002, 0xbf800001, 0x3f800003); + status |= test__mulsf3(0xc0000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0xc0000000, 0x007ffffd, 0x80fffffa); + status |= test__mulsf3(0xc0000000, 0x00800001, 0x81000001); + status |= test__mulsf3(0xc0000000, 0x00800005, 0x81000005); + status |= test__mulsf3(0xc0000000, 0x00800009, 0x81000009); + status |= test__mulsf3(0xc0000000, 0x40400000, 0xc0c00000); + status |= test__mulsf3(0xc0000000, 0x7e7fffff, 0xfeffffff); + status |= test__mulsf3(0xc0000000, 0x7e800001, 0xff000001); + status |= test__mulsf3(0xc0000000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xc0000000, 0xbf800000, 0x40000000); + status |= test__mulsf3(0xc0000000, 0xc0400000, 0x40c00000); + status |= test__mulsf3(0xc03ffffe, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc03fffff, 0x3f7fffff, 0xc03ffffe); + status |= test__mulsf3(0xc0400000, 0x40400000, 0xc1100000); + status |= test__mulsf3(0xc0400000, 0xc0000000, 0x40c00000); + status |= test__mulsf3(0xc0400000, 0xc0400000, 0x41100000); + status |= test__mulsf3(0xc0400000, 0xff000000, 0x7f800000); + status |= 
test__mulsf3(0xc0400001, 0x3f800001, 0xc0400003); + status |= test__mulsf3(0xc0800000, 0x7e7fffff, 0xff7fffff); + status |= test__mulsf3(0xc0800000, 0x80000000, 0x00000000); + status |= test__mulsf3(0xc0800000, 0xfe7fffff, 0x7f7fffff); + status |= test__mulsf3(0xc0800000, 0xff800000, 0x7f800000); + status |= test__mulsf3(0xc09ffffe, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xc09fffff, 0xbf7fffff, 0x409ffffe); + status |= test__mulsf3(0xc0a00001, 0xbf800001, 0x40a00002); + status |= test__mulsf3(0xc0dffff9, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc1100000, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc1100001, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xfe7ffff9, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xfe7ffff9, 0xc07fffff, 0x7f7ffff8); + status |= test__mulsf3(0xfe7ffffd, 0x40800000, 0xff7ffffd); + status |= test__mulsf3(0xfe7ffffd, 0xc0800000, 0x7f7ffffd); + status |= test__mulsf3(0xfe7fffff, 0x00000000, 0x80000000); + status |= test__mulsf3(0xfe7fffff, 0x40000001, 0xff000000); + status |= test__mulsf3(0xfe7fffff, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xfe800000, 0x00000000, 0x80000000); + status |= test__mulsf3(0xfe800000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xfefffff7, 0x7e800001, 0xff800000); + status |= test__mulsf3(0xfeffffff, 0x3f800001, 0xff000000); + status |= test__mulsf3(0xfeffffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0xff000005, 0xff000001, 0x7f800000); + status |= test__mulsf3(0xff7ffffd, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xff7ffffd, 0xc0400001, 0x7f800000); + status |= test__mulsf3(0xff7ffffd, 0xff000001, 0x7f800000); + status |= test__mulsf3(0xff7fffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0xff7fffff, 0xff7fffff, 0x7f800000); + status |= test__mulsf3(0xff7fffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0x40c00000, 0xff800000); + status |= test__mulsf3(0xff800000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xff800000, 0x80000004, 0x7f800000); + status |= test__mulsf3(0xff800000, 0x80800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xc1000000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xfe800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x3089705f, 0x0ef36390, 0x0041558f); + status |= test__mulsf3(0x3089705f, 0x0e936390, 0x0027907d); + status |= test__mulsf3(0x3109705f, 0x0ef36390, 0x0082ab1e); + status |= test__mulsf3(0x3109705f, 0x0e936390, 0x004f20fa); + status |= test__mulsf3(0x3189705f, 0x0ef36390, 0x0102ab1e); + status |= test__mulsf3(0x3189705f, 0x0e936390, 0x009e41f5); + status |= test__mulsf3(0xb089705f, 0x0ef36390, 0x8041558f); + status |= test__mulsf3(0xb089705f, 0x0e936390, 0x8027907d); + status |= test__mulsf3(0xb109705f, 0x0ef36390, 0x8082ab1e); + status |= test__mulsf3(0xb109705f, 0x0e936390, 0x804f20fa); + status |= test__mulsf3(0xb189705f, 0x0ef36390, 0x8102ab1e); + status |= test__mulsf3(0xb189705f, 0x0e936390, 0x809e41f5); + status |= test__mulsf3(0x3089705f, 0x8ef36390, 0x8041558f); + status |= test__mulsf3(0x3089705f, 0x8e936390, 0x8027907d); + status |= test__mulsf3(0x3109705f, 0x8ef36390, 0x8082ab1e); + status |= test__mulsf3(0x3109705f, 0x8e936390, 0x804f20fa); + status |= test__mulsf3(0x3189705f, 0x8ef36390, 0x8102ab1e); + status |= test__mulsf3(0x3189705f, 0x8e936390, 0x809e41f5); + status |= test__mulsf3(0xb089705f, 0x8ef36390, 0x0041558f); + status |= test__mulsf3(0xb089705f, 0x8e936390, 0x0027907d); + status |= 
test__mulsf3(0xb109705f, 0x8ef36390, 0x0082ab1e); + status |= test__mulsf3(0xb109705f, 0x8e936390, 0x004f20fa); + status |= test__mulsf3(0xb189705f, 0x8ef36390, 0x0102ab1e); + status |= test__mulsf3(0xb189705f, 0x8e936390, 0x009e41f5); + status |= test__mulsf3(0x1f800001, 0x1fc00000, 0x00300000); + status |= test__mulsf3(0x1f800003, 0x1fc00000, 0x00300001); + status |= test__mulsf3(0x1f800001, 0x1fc00800, 0x00300200); + status |= test__mulsf3(0x1f800003, 0x1fc00800, 0x00300201); + status |= test__mulsf3(0x36e4588a, 0x29b47cbd, 0x2120fd85); + status |= test__mulsf3(0x3fea3b26, 0x3f400000, 0x3fafac5c); + status |= test__mulsf3(0x6fea3b26, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20ea3b26, 0x1ec00000, 0x0057d62e); + status |= test__mulsf3(0x3f8f11bb, 0x3fc00000, 0x3fd69a98); + status |= test__mulsf3(0x6f8f11bb, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f400000, 0x006b4d4c); + status |= test__mulsf3(0x3f8f11bb, 0x3f800000, 0x3f8f11bb); + status |= test__mulsf3(0x6f8f11bb, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f000000, 0x004788de); + status |= test__mulsf3(0x3f8f11bb, 0x3fd7f48d, 0x3ff1611f); + status |= test__mulsf3(0x6f8f11bb, 0x4fd7f48d, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f57f48d, 0x0078b090); + status |= test__mulsf3(0x3f8f11bb, 0x3fa80b73, 0x3fbbd412); + status |= test__mulsf3(0x6f8f11bb, 0x4fa80b73, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f280b73, 0x005dea09); + status |= test__mulsf3(0x3f8f11bb, 0x3f97f48d, 0x3fa9d842); + status |= test__mulsf3(0x6f8f11bb, 0x4f97f48d, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f17f48d, 0x0054ec21); + status |= test__mulsf3(0x3f8f11bb, 0x3f680b73, 0x3f81ae78); + status |= test__mulsf3(0x6f8f11bb, 0x4f680b73, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1ee80b73, 0x0040d73c); + status |= test__mulsf3(0x3fff5dd8, 0x3f600000, 0x3fdf721d); + status |= test__mulsf3(0x6fff5dd8, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20ff5dd8, 0x1ee00000, 0x006fb90e); + status |= test__mulsf3(0x3fff5dd8, 0x3f100000, 0x3f8fa4ca); + status |= test__mulsf3(0x6fff5dd8, 0x4f100000, 0x7f800000); + status |= test__mulsf3(0x20ff5dd8, 0x1e900000, 0x0047d265); + status |= test__mulsf3(0x3fffe96b, 0x3f7efb43, 0x3ffee4c5); + status |= test__mulsf3(0x6fffe96b, 0x4f7efb43, 0x7f800000); + status |= test__mulsf3(0x20ffe96b, 0x1efefb43, 0x007f7263); + status |= test__mulsf3(0x3fffe96b, 0x3f0104bd, 0x3f80f95b); + status |= test__mulsf3(0x6fffe96b, 0x4f0104bd, 0x7f800000); + status |= test__mulsf3(0x20ffe96b, 0x1e8104bd, 0x00407cae); + status |= test__mulsf3(0x3f8fbbb7, 0x3fa6edf9, 0x3fbb72aa); + status |= test__mulsf3(0x6f8fbbb7, 0x4fa6edf9, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f26edf9, 0x005db955); + status |= test__mulsf3(0x3f8fbbb7, 0x3fd91207, 0x3ff3c07b); + status |= test__mulsf3(0x6f8fbbb7, 0x4fd91207, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f591207, 0x0079e03d); + status |= test__mulsf3(0x3f8fbbb7, 0x3f991207, 0x3fabe29f); + status |= test__mulsf3(0x6f8fbbb7, 0x4f991207, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f191207, 0x0055f150); + status |= test__mulsf3(0x3f8fbbb7, 0x3f66edf9, 0x3f81a843); + status |= test__mulsf3(0x6f8fbbb7, 0x4f66edf9, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1ee6edf9, 0x0040d421); + status |= test__mulsf3(0x3fdb62f3, 0x3f7879c5, 0x3fd4f036); + status |= test__mulsf3(0x6fdb62f3, 0x4f7879c5, 0x7f800000); + status |= test__mulsf3(0x20db62f3, 0x1ef879c5, 0x006a781b); + status |= 
test__mulsf3(0x3faaea45, 0x3f8b6773, 0x3fba2489); + status |= test__mulsf3(0x6faaea45, 0x4f8b6773, 0x7f800000); + status |= test__mulsf3(0x20aaea45, 0x1f0b6773, 0x005d1244); + status |= test__mulsf3(0x3fafa7ec, 0x3f900000, 0x3fc59cea); + status |= test__mulsf3(0x6fafa7ec, 0x4f900000, 0x7f800000); + status |= test__mulsf3(0x20afa7ec, 0x1f100000, 0x0062ce75); + status |= test__mulsf3(0x3fcf8c8d, 0x3f271645, 0x3f8776be); + status |= test__mulsf3(0x6fcf8c8d, 0x4f271645, 0x7f800000); + status |= test__mulsf3(0x20cf8c8d, 0x1ea71645, 0x0043bb5f); + status |= test__mulsf3(0x3fc173ef, 0x3f901b0f, 0x3fd9cb52); + status |= test__mulsf3(0x6fc173ef, 0x4f901b0f, 0x7f800000); + status |= test__mulsf3(0x20c173ef, 0x1f101b0f, 0x006ce5a9); + status |= test__mulsf3(0x3fb48d33, 0x3f4a35fb, 0x3f8e9d7d); + status |= test__mulsf3(0x6fb48d33, 0x4f4a35fb, 0x7f800000); + status |= test__mulsf3(0x20b48d33, 0x1eca35fb, 0x00474ebe); + status |= test__mulsf3(0x3fc6f87b, 0x3f65d94d, 0x3fb2a52a); + status |= test__mulsf3(0x6fc6f87b, 0x4f65d94d, 0x7f800000); + status |= test__mulsf3(0x20c6f87b, 0x1ee5d94d, 0x00595295); + status |= test__mulsf3(0x3f860ae7, 0x3f969729, 0x3f9db312); + status |= test__mulsf3(0x6f860ae7, 0x4f969729, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f169729, 0x004ed989); + status |= test__mulsf3(0x3f860ae7, 0x3fc00000, 0x3fc9105a); + status |= test__mulsf3(0x6f860ae7, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f400000, 0x0064882d); + status |= test__mulsf3(0x3f860ae7, 0x3fe968d7, 0x3ff46da3); + status |= test__mulsf3(0x6f860ae7, 0x4fe968d7, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f6968d7, 0x007a36d1); + status |= test__mulsf3(0x3f860ae7, 0x3f800000, 0x3f860ae7); + status |= test__mulsf3(0x6f860ae7, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f000000, 0x00430574); + status |= test__mulsf3(0x3f860ae7, 0x3fa968d7, 0x3fb1682f); + status |= test__mulsf3(0x6f860ae7, 0x4fa968d7, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f2968d7, 0x0058b418); + status |= test__mulsf3(0x3f860ae7, 0x3fd69729, 0x3fe0b886); + status |= test__mulsf3(0x6f860ae7, 0x4fd69729, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f569729, 0x00705c43); + status |= test__mulsf3(0x3f9aecdd, 0x3fb14b75, 0x3fd696de); + status |= test__mulsf3(0x6f9aecdd, 0x4fb14b75, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f314b75, 0x006b4b6f); + status |= test__mulsf3(0x3f9aecdd, 0x3fceb48b, 0x3ffa2fb9); + status |= test__mulsf3(0x6f9aecdd, 0x4fceb48b, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f4eb48b, 0x007d17dc); + status |= test__mulsf3(0x3f9aecdd, 0x3fc00000, 0x3fe8634c); + status |= test__mulsf3(0x6f9aecdd, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f400000, 0x007431a6); + status |= test__mulsf3(0x3fd65dc6, 0x3f400000, 0x3fa0c654); + status |= test__mulsf3(0x6fd65dc6, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20d65dc6, 0x1ec00000, 0x0050632a); + status |= test__mulsf3(0x3feecf03, 0x3f5f93ab, 0x3fd09014); + status |= test__mulsf3(0x6feecf03, 0x4f5f93ab, 0x7f800000); + status |= test__mulsf3(0x20eecf03, 0x1edf93ab, 0x0068480a); + status |= test__mulsf3(0x3feecf03, 0x3f206c55, 0x3f95a670); + status |= test__mulsf3(0x6feecf03, 0x4f206c55, 0x7f800000); + status |= test__mulsf3(0x20eecf03, 0x1ea06c55, 0x004ad338); + status |= test__mulsf3(0x3f98feed, 0x3f60f11b, 0x3f866f27); + status |= test__mulsf3(0x6f98feed, 0x4f60f11b, 0x7f800000); + status |= test__mulsf3(0x2098feed, 0x1ee0f11b, 0x00433794); + status |= 
test__mulsf3(0x3f9a1b9d, 0x3f9c42b5, 0x3fbc21f8); + status |= test__mulsf3(0x6f9a1b9d, 0x4f9c42b5, 0x7f800000); + status |= test__mulsf3(0x209a1b9d, 0x1f1c42b5, 0x005e10fc); + status |= test__mulsf3(0x3f9a1b9d, 0x3f5c42b5, 0x3f8497e3); + status |= test__mulsf3(0x6f9a1b9d, 0x4f5c42b5, 0x7f800000); + status |= test__mulsf3(0x209a1b9d, 0x1edc42b5, 0x00424bf2); + status |= test__mulsf3(0x3f947044, 0x3f600000, 0x3f81e23c); + status |= test__mulsf3(0x6f947044, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20947044, 0x1ee00000, 0x0040f11e); + status |= test__mulsf3(0x3fa3fb77, 0x3f6eb1b9, 0x3f98e5a0); + status |= test__mulsf3(0x6fa3fb77, 0x4f6eb1b9, 0x7f800000); + status |= test__mulsf3(0x20a3fb77, 0x1eeeb1b9, 0x004c72d0); + status |= test__mulsf3(0x3fb291df, 0x3f466a1f, 0x3f8a66d9); + status |= test__mulsf3(0x6fb291df, 0x4f466a1f, 0x7f800000); + status |= test__mulsf3(0x20b291df, 0x1ec66a1f, 0x0045336c); + status |= test__mulsf3(0x3fde13d5, 0x3f6b7283, 0x3fcc3f8b); + status |= test__mulsf3(0x6fde13d5, 0x4f6b7283, 0x7f800000); + status |= test__mulsf3(0x20de13d5, 0x1eeb7283, 0x00661fc5); + status |= test__mulsf3(0x3fd5b211, 0x3f80810f, 0x3fd68987); + status |= test__mulsf3(0x6fd5b211, 0x4f80810f, 0x7f800000); + status |= test__mulsf3(0x20d5b211, 0x1f00810f, 0x006b44c4); + status |= test__mulsf3(0x3fd5b211, 0x3f3f7ef1, 0x3f9fd9d2); + status |= test__mulsf3(0x6fd5b211, 0x4f3f7ef1, 0x7f800000); + status |= test__mulsf3(0x20d5b211, 0x1ebf7ef1, 0x004fece9); + status |= test__mulsf3(0x3fadfbc4, 0x3f400000, 0x3f827cd3); + status |= test__mulsf3(0x6fadfbc4, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20adfbc4, 0x1ec00000, 0x00413e6a); + status |= test__mulsf3(0x3fd0ef03, 0x3f800000, 0x3fd0ef03); + status |= test__mulsf3(0x6fd0ef03, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1f000000, 0x00687782); + status |= test__mulsf3(0x3fd0ef03, 0x3f8673ab, 0x3fdb7705); + status |= test__mulsf3(0x6fd0ef03, 0x4f8673ab, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1f0673ab, 0x006dbb83); + status |= test__mulsf3(0x3fd0ef03, 0x3f798c55, 0x3fcbab02); + status |= test__mulsf3(0x6fd0ef03, 0x4f798c55, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1ef98c55, 0x0065d581); + status |= test__mulsf3(0x3fdd1181, 0x3f8ad17f, 0x3fefc0b1); + status |= test__mulsf3(0x6fdd1181, 0x4f8ad17f, 0x7f800000); + status |= test__mulsf3(0x20dd1181, 0x1f0ad17f, 0x0077e058); + status |= test__mulsf3(0x3fdd1181, 0x3f752e81, 0x3fd3b9e9); + status |= test__mulsf3(0x6fdd1181, 0x4f752e81, 0x7f800000); + status |= test__mulsf3(0x20dd1181, 0x1ef52e81, 0x0069dcf5); + status |= test__mulsf3(0x3f92efc6, 0x3fa00000, 0x3fb7abb8); + status |= test__mulsf3(0x6f92efc6, 0x4fa00000, 0x7f800000); + status |= test__mulsf3(0x2092efc6, 0x1f200000, 0x005bd5dc); + status |= test__mulsf3(0x3fdcefe6, 0x3f400000, 0x3fa5b3ec); + status |= test__mulsf3(0x6fdcefe6, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20dcefe6, 0x1ec00000, 0x0052d9f6); + status |= test__mulsf3(0x3fad6507, 0x3fa2f8b7, 0x3fdcc4c9); + status |= test__mulsf3(0x6fad6507, 0x4fa2f8b7, 0x7f800000); + status |= test__mulsf3(0x20ad6507, 0x1f22f8b7, 0x006e6264); + status |= test__mulsf3(0x3fad6507, 0x3f62f8b7, 0x3f99bba6); + status |= test__mulsf3(0x6fad6507, 0x4f62f8b7, 0x7f800000); + status |= test__mulsf3(0x20ad6507, 0x1ee2f8b7, 0x004cddd3); + status |= test__mulsf3(0x3fbfde6b, 0x3f8721bd, 0x3fca8f27); + status |= test__mulsf3(0x6fbfde6b, 0x4f8721bd, 0x7f800000); + status |= test__mulsf3(0x20bfde6b, 0x1f0721bd, 0x00654794); + status |= 
test__mulsf3(0x3fbfde6b, 0x3f4721bd, 0x3f953f2e); + status |= test__mulsf3(0x6fbfde6b, 0x4f4721bd, 0x7f800000); + status |= test__mulsf3(0x20bfde6b, 0x1ec721bd, 0x004a9f97); + status |= test__mulsf3(0x3ff40db4, 0x3f400000, 0x3fb70a47); + status |= test__mulsf3(0x6ff40db4, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20f40db4, 0x1ec00000, 0x005b8524); + status |= test__mulsf3(0x3ff40db4, 0x3f600000, 0x3fd58bfe); + status |= test__mulsf3(0x6ff40db4, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20f40db4, 0x1ee00000, 0x006ac5ff); + status |= test__mulsf3(0x3f9e20d3, 0x3f90c8a5, 0x3fb2dccc); + status |= test__mulsf3(0x6f9e20d3, 0x4f90c8a5, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1f10c8a5, 0x00596e66); + status |= test__mulsf3(0x3f9e20d3, 0x3fc00000, 0x3fed313c); + status |= test__mulsf3(0x6f9e20d3, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1f400000, 0x0076989e); + status |= test__mulsf3(0x3f9e20d3, 0x3f50c8a5, 0x3f80f69b); + status |= test__mulsf3(0x6f9e20d3, 0x4f50c8a5, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1ed0c8a5, 0x00407b4d); + status |= test__mulsf3(0x3f82e641, 0x3f8fd63f, 0x3f931856); + status |= test__mulsf3(0x6f82e641, 0x4f8fd63f, 0x7f800000); + status |= test__mulsf3(0x2082e641, 0x1f0fd63f, 0x00498c2b); + status |= test__mulsf3(0x3f9a1901, 0x3f96e701, 0x3fb5ab68); + status |= test__mulsf3(0x6f9a1901, 0x4f96e701, 0x7f800000); + status |= test__mulsf3(0x209a1901, 0x1f16e701, 0x005ad5b4); + status |= test__mulsf3(0x3fa21aa1, 0x3f7c4961, 0x3f9fc0ae); + status |= test__mulsf3(0x6fa21aa1, 0x4f7c4961, 0x7f800000); + status |= test__mulsf3(0x20a21aa1, 0x1efc4961, 0x004fe057); + status |= test__mulsf3(0x3fcd0767, 0x3f782457, 0x3fc6bc47); + status |= test__mulsf3(0x6fcd0767, 0x4f782457, 0x7f800000); + status |= test__mulsf3(0x20cd0767, 0x1ef82457, 0x00635e23); + status |= test__mulsf3(0x3fb875e1, 0x3f968e21, 0x3fd8f6f6); + status |= test__mulsf3(0x6fb875e1, 0x4f968e21, 0x7f800000); + status |= test__mulsf3(0x20b875e1, 0x1f168e21, 0x006c7b7b); + status |= test__mulsf3(0x3fc2f0d7, 0x3f5efd19, 0x3fa9cd95); + status |= test__mulsf3(0x6fc2f0d7, 0x4f5efd19, 0x7f800000); + status |= test__mulsf3(0x20c2f0d7, 0x1edefd19, 0x0054e6cb); + status |= test__mulsf3(0x7f7ffffe, 0x3f800001, 0x7f800000); + status |= test__mulsf3(0x00000003, 0xc00fffff, 0x80000007); + status |= test__mulsf3(0x00000003, 0x400fffff, 0x00000007); + status |= test__mulsf3(0x80000003, 0xc00fffff, 0x00000007); + status |= test__mulsf3(0x80000003, 0x400fffff, 0x80000007); + status |= test__mulsf3(0x00000003, 0xc00ffffd, 0x80000007); + status |= test__mulsf3(0x00000003, 0x400ffffd, 0x00000007); + status |= test__mulsf3(0x80000003, 0xc00ffffd, 0x00000007); + status |= test__mulsf3(0x80000003, 0x400ffffd, 0x80000007); + status |= test__mulsf3(0x3e00007f, 0x017c0000, 0x003f003f); + status |= test__mulsf3(0xcf7fff00, 0xc0ffff00, 0x50fffe00); + status |= test__mulsf3(0x3fdf7f00, 0x3fffff00, 0x405f7e21); + status |= test__mulsf3(0x19b92144, 0x1a310000, 0x00000001); + status |= test__mulsf3(0x19ffc008, 0x1a002004, 0x00000001); + status |= test__mulsf3(0x7f7ffff0, 0xc0000008, 0xff800000); + + // Test that the result of an operation is a NaN at all when it should be. + // + // In most configurations these tests' results are checked using + // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000, + // which causes compareResultF to accept any NaN encoding.
We also use the + // same value as the input NaN in tests that have one, so that even in + // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is + // still the exact expected NaN. + status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000); + status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000); + status |= test__mulsf3(0x3f800000, 0x7fc00000, 0x7fc00000); + status |= test__mulsf3(0x7fc00000, 0x3f800000, 0x7fc00000); + status |= test__mulsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); + +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/mulsf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7fc00000. + + status |= test__mulsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__mulsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__mulsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__mulsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__mulsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__mulsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__mulsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__mulsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__mulsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__mulsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000); + status |= test__mulsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__mulsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000); + status |= test__mulsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__mulsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__mulsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__mulsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__mulsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__mulsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__mulsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__mulsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__mulsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__mulsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__mulsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__mulsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__mulsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__mulsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= test__mulsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__mulsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); + status |= test__mulsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__mulsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= test__mulsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= test__mulsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__mulsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__mulsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= 
test__mulsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__mulsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__mulsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__mulsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__mulsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__mulsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__mulsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000); + status |= test__mulsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__mulsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__mulsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__mulsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__mulsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__mulsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__mulsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= test__mulsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); + status |= test__mulsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); + status |= test__mulsf3(0xff800000, 0x7fde0397, 0x7fde0397); +#endif // ARM_NAN_HANDLING + + return status; +} From 2432465d99d5740bc335bcce50024878134fcc08 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 18 Nov 2025 11:31:11 +0000 Subject: [PATCH 31/35] [VPlan] Support isa/dyn_cast from VPRecipeBase to VPIRMetadata (NFC). (#166245) Implement CastInfo from VPRecipeBase to VPIRMetadata to support isa/dyn_cast. This is similar to CastInfoVPPhiAccessors, supporting dyn_cast by down-casting to the concrete recipe types inheriting from VPIRMetadata. Can be used for more generalized VPIRMetadata printing following https://github.com/llvm/llvm-project/pull/165825. PR: https://github.com/llvm/llvm-project/pull/166245 --- llvm/lib/Transforms/Vectorize/VPlan.h | 69 +++++++++++++++++++ .../Transforms/Vectorize/VPlanTest.cpp | 27 ++++---- 2 files changed, 83 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a88ddf217da9b..fc29ab0c84093 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3876,6 +3876,75 @@ template <> struct CastInfo : CastInfoVPPhiAccessors {}; +/// Casting from (const) VPRecipeBase -> (const) VPIRMetadata is supported for +/// all recipe types implementing VPIRMetadata. Used by isa<> & co. +namespace detail { +template +static inline auto castToVPIRMetadata(RecipeBasePtrTy R) -> DstTy { + switch (R->getVPDefID()) { + case VPDef::VPInstructionSC: + return cast(R); + case VPDef::VPWidenSC: + return cast(R); + case VPDef::VPWidenCastSC: + return cast(R); + case VPDef::VPWidenIntrinsicSC: + return cast(R); + case VPDef::VPWidenCallSC: + return cast(R); + case VPDef::VPWidenSelectSC: + return cast(R); + case VPDef::VPReplicateSC: + return cast(R); + case VPDef::VPInterleaveSC: + case VPDef::VPInterleaveEVLSC: + return cast(R); + case VPDef::VPWidenLoadSC: + case VPDef::VPWidenLoadEVLSC: + case VPDef::VPWidenStoreSC: + case VPDef::VPWidenStoreEVLSC: + return cast(R); + default: + llvm_unreachable("invalid recipe for VPIRMetadata cast"); + } +} +} // namespace detail + +/// Support casting from VPRecipeBase -> VPIRMetadata, by down-casting to the +/// recipe types implementing VPIRMetadata. Used by cast<>, dyn_cast<> & co.
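+/// For illustration, a hypothetical use (the names `R` and `MD` are invented,
+/// not taken from this patch): given a `VPRecipeBase *R`,
+///   if (auto *MD = dyn_cast<VPIRMetadata>(R))
+///     ...; // MD is non-null only for the recipe types handled in
+///          // castToVPIRMetadata above.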
+template +struct CastInfoVPIRMetadata : public CastIsPossible { + static inline bool isPossible(SrcTy R) { + // NOTE: Each recipe inheriting from VPIRMetadata must be listed here and + // also handled in castToVPIRMetadata. + return isa(R); + } + + using RetTy = DstTy *; + + /// doCast is used by cast<>. + static inline RetTy doCast(SrcTy R) { + return detail::castToVPIRMetadata(R); + } + + /// doCastIfPossible is used by dyn_cast<>. + static inline RetTy doCastIfPossible(SrcTy R) { + if (!isPossible(R)) + return nullptr; + return doCast(R); + } +}; +template <> +struct CastInfo + : CastInfoVPIRMetadata {}; +template <> +struct CastInfo + : CastInfoVPIRMetadata {}; + /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It /// holds a sequence of zero or more VPRecipe's each representing a sequence of /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes. diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 0e76c64f09f59..3842ba235ead3 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -996,7 +996,7 @@ TEST_F(VPRecipeTest, CastVPInstructionToVPUser) { VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPInstruction Recipe(Instruction::Add, {Op1, Op2}); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); } TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { @@ -1011,7 +1011,7 @@ TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { Args.push_back(Op2); VPWidenRecipe WidenR(*AI, Args, VPIRMetadata(), DebugLoc()); - checkVPRecipeCastImpl(&WidenR); + checkVPRecipeCastImpl(&WidenR); delete AI; } @@ -1030,7 +1030,7 @@ TEST_F(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) { Args.push_back(CalledFn); VPWidenCallRecipe Recipe(Call, Fn, Args); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); VPValue *VPV = &Recipe; EXPECT_TRUE(VPV->getDefiningRecipe()); @@ -1056,7 +1056,8 @@ TEST_F(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) { VPWidenSelectRecipe WidenSelectR(*SelectI, make_range(Args.begin(), Args.end())); - checkVPRecipeCastImpl(&WidenSelectR); + checkVPRecipeCastImpl( + &WidenSelectR); VPValue *VPV = &WidenSelectR; EXPECT_EQ(&WidenSelectR, VPV->getDefiningRecipe()); @@ -1094,7 +1095,7 @@ TEST_F(VPRecipeTest, CastVPWidenCastRecipeToVPUser) { VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, *Cast, {}); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); delete Cast; } @@ -1105,7 +1106,7 @@ TEST_F(VPRecipeTest, CastVPWidenIntrinsicRecipeToVPUser) { VPValue *Op2 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPWidenIntrinsicRecipe Recipe(Intrinsic::smax, {Op1, Op2}, Int32); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); } TEST_F(VPRecipeTest, CastVPBlendRecipeToVPUser) { @@ -1135,7 +1136,7 @@ TEST_F(VPRecipeTest, CastVPInterleaveRecipeToVPUser) { InterleaveGroup IG(4, false, Align(4)); VPInterleaveRecipe Recipe(&IG, Addr, {}, Mask, false, {}, DebugLoc()); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); } TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) { @@ -1151,7 +1152,7 @@ TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) { auto *Call = CallInst::Create(FTy, PoisonValue::get(FTy)); VPReplicateRecipe Recipe(Call, make_range(Args.begin(), Args.end()), true); - checkVPRecipeCastImpl(&Recipe); + 
checkVPRecipeCastImpl(&Recipe); delete Call; } @@ -1175,7 +1176,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {}); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); VPValue *VPV = Recipe.getVPSingleValue(); EXPECT_TRUE(isa(VPV->getDefiningRecipe())); @@ -1194,7 +1195,7 @@ TEST_F(VPRecipeTest, CastVPInterleaveEVLRecipeToVPUser) { VPInterleaveRecipe BaseRecipe(&IG, Addr, {}, Mask, false, {}, DebugLoc()); VPInterleaveEVLRecipe Recipe(BaseRecipe, *EVL, Mask); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); } TEST_F(VPRecipeTest, CastVPWidenLoadEVLRecipeToVPUser) { @@ -1209,7 +1210,7 @@ TEST_F(VPRecipeTest, CastVPWidenLoadEVLRecipeToVPUser) { VPWidenLoadRecipe BaseLoad(*Load, Addr, Mask, true, false, {}, {}); VPWidenLoadEVLRecipe Recipe(BaseLoad, Addr, *EVL, Mask); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); delete Load; } @@ -1225,7 +1226,7 @@ TEST_F(VPRecipeTest, CastVPWidenStoreRecipeToVPUser) { VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPWidenStoreRecipe Recipe(*Store, Addr, StoredVal, Mask, true, false, {}, {}); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); delete Store; } @@ -1244,7 +1245,7 @@ TEST_F(VPRecipeTest, CastVPWidenStoreEVLRecipeToVPUser) { {}); VPWidenStoreEVLRecipe Recipe(BaseStore, Addr, *EVL, Mask); - checkVPRecipeCastImpl(&Recipe); + checkVPRecipeCastImpl(&Recipe); delete Store; } From 27231bc28a1cedf3a8463eb455f4af639915a676 Mon Sep 17 00:00:00 2001 From: Hank <49036880+hankluo6@users.noreply.github.com> Date: Tue, 18 Nov 2025 03:31:46 -0800 Subject: [PATCH 32/35] [MLIR][SPIRV] Lower SPIR-V Tan/Tanh ops to LLVM intrinsics (#168419) Fixed #148354 Lower SPIR-V Tan/Tanh ops using the corresponding LLVM intrinsics to reduce instructions and prevent overflow caused by the previous `exp`-based expansion. 
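Illustrative before/after sketch, distilled from the gl-ops-to-llvm.mlir updates below (the `f32` operand `%arg0` is assumed):

  %0 = spirv.GL.Tanh %arg0 : f32
  // old lowering: exp(2x)-based fmul/exp/fsub/fadd/fdiv sequence
  // new lowering:
  %0 = llvm.intr.tanh(%arg0) : (f32) -> f32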
--- .../Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp | 26 +++---------------- .../SPIRVToLLVM/gl-ops-to-llvm.mlir | 12 ++------- 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp index 50fca564b5b64..02b61bd989368 100644 --- a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp +++ b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp @@ -1520,20 +1520,12 @@ class TanPattern : public SPIRVToLLVMConversion { if (!dstType) return rewriter.notifyMatchFailure(tanOp, "type conversion failed"); - Location loc = tanOp.getLoc(); - Value sin = LLVM::SinOp::create(rewriter, loc, dstType, tanOp.getOperand()); - Value cos = LLVM::CosOp::create(rewriter, loc, dstType, tanOp.getOperand()); - rewriter.replaceOpWithNewOp(tanOp, dstType, sin, cos); + rewriter.replaceOpWithNewOp(tanOp, dstType, + adaptor.getOperands()); return success(); } }; -/// Convert `spirv.Tanh` to -/// -/// exp(2x) - 1 -/// ----------- -/// exp(2x) + 1 -/// class TanhPattern : public SPIRVToLLVMConversion { public: using SPIRVToLLVMConversion::SPIRVToLLVMConversion; @@ -1546,18 +1538,8 @@ class TanhPattern : public SPIRVToLLVMConversion { if (!dstType) return rewriter.notifyMatchFailure(tanhOp, "type conversion failed"); - Location loc = tanhOp.getLoc(); - Value two = createFPConstant(loc, srcType, dstType, rewriter, 2.0); - Value multiplied = - LLVM::FMulOp::create(rewriter, loc, dstType, two, tanhOp.getOperand()); - Value exponential = LLVM::ExpOp::create(rewriter, loc, dstType, multiplied); - Value one = createFPConstant(loc, srcType, dstType, rewriter, 1.0); - Value numerator = - LLVM::FSubOp::create(rewriter, loc, dstType, exponential, one); - Value denominator = - LLVM::FAddOp::create(rewriter, loc, dstType, exponential, one); - rewriter.replaceOpWithNewOp(tanhOp, dstType, numerator, - denominator); + rewriter.replaceOpWithNewOp(tanhOp, dstType, + adaptor.getOperands()); return success(); } }; diff --git a/mlir/test/Conversion/SPIRVToLLVM/gl-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/gl-ops-to-llvm.mlir index e1936e2fd8abe..b17e1c40cb9a7 100644 --- a/mlir/test/Conversion/SPIRVToLLVM/gl-ops-to-llvm.mlir +++ b/mlir/test/Conversion/SPIRVToLLVM/gl-ops-to-llvm.mlir @@ -162,9 +162,7 @@ spirv.func @sqrt(%arg0: f32, %arg1: vector<3xf16>) "None" { // CHECK-LABEL: @tan spirv.func @tan(%arg0: f32) "None" { - // CHECK: %[[SIN:.*]] = llvm.intr.sin(%{{.*}}) : (f32) -> f32 - // CHECK: %[[COS:.*]] = llvm.intr.cos(%{{.*}}) : (f32) -> f32 - // CHECK: llvm.fdiv %[[SIN]], %[[COS]] : f32 + // CHECK: llvm.intr.tan(%{{.*}}) : (f32) -> f32 %0 = spirv.GL.Tan %arg0 : f32 spirv.Return } @@ -175,13 +173,7 @@ spirv.func @tan(%arg0: f32) "None" { // CHECK-LABEL: @tanh spirv.func @tanh(%arg0: f32) "None" { - // CHECK: %[[TWO:.*]] = llvm.mlir.constant(2.000000e+00 : f32) : f32 - // CHECK: %[[X2:.*]] = llvm.fmul %[[TWO]], %{{.*}} : f32 - // CHECK: %[[EXP:.*]] = llvm.intr.exp(%[[X2]]) : (f32) -> f32 - // CHECK: %[[ONE:.*]] = llvm.mlir.constant(1.000000e+00 : f32) : f32 - // CHECK: %[[T0:.*]] = llvm.fsub %[[EXP]], %[[ONE]] : f32 - // CHECK: %[[T1:.*]] = llvm.fadd %[[EXP]], %[[ONE]] : f32 - // CHECK: llvm.fdiv %[[T0]], %[[T1]] : f32 + // CHECK: llvm.intr.tanh(%{{.*}}) : (f32) -> f32 %0 = spirv.GL.Tanh %arg0 : f32 spirv.Return } From 591c463e0754fe67f77ba72c0dd2b2b2416dcdd0 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 18 Nov 2025 11:37:51 +0000 Subject: [PATCH 33/35] [LLVM][AArch64] Mark SVE integer intrinsics as speculatable. 
(#167915) Exceptions include intrinsics that: * take or return floating point data * read or write FFR * read or write memory * read or write SME state --- llvm/include/llvm/IR/IntrinsicsAArch64.td | 1289 ++++++++--------- .../aarch64-intrinsics-attributes.ll | 3 +- .../AArch64/speculative-intrinsic-hoisting.ll | 73 + 3 files changed, 719 insertions(+), 646 deletions(-) create mode 100644 llvm/test/Transforms/LICM/AArch64/speculative-intrinsic-hoisting.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index c84c158c57b8e..77fdb8295faa8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -126,8 +126,8 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; class AdvSIMD_1FloatArg_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Intrinsic - : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Intrinsic Attrs = []> + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_1VectorArg_Expand_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; class AdvSIMD_1IntArg_Narrow_Intrinsic @@ -145,9 +145,9 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". class AdvSIMD_2FloatArg_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_2VectorArg_Intrinsic + class AdvSIMD_2VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_2Arg_FloatCompare_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>], [IntrNoMem]>; @@ -175,15 +175,14 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - class AdvSIMD_3IntArg_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_3VectorArg_Intrinsic + class AdvSIMD_3VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_3VectorArg_Scalar_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], @@ -1095,124 +1094,124 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_ptr_ty], [IntrArgMemOnly, NoCapture>]>; - class AdvSIMD_SVE_Index_Intrinsic + class AdvSIMD_SVE_Index_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMVectorElementType<0>, LLVMVectorElementType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_Merged1VectorArg_Intrinsic + class AdvSIMD_Merged1VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_2VectorArgIndexed_Intrinsic + class AdvSIMD_2VectorArgIndexed_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class AdvSIMD_3VectorArgIndexed_Intrinsic + class AdvSIMD_3VectorArgIndexed_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class AdvSIMD_Pred1VectorArg_Intrinsic + class AdvSIMD_Pred1VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_Pred2VectorArg_Intrinsic + class AdvSIMD_Pred2VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_Pred3VectorArg_Intrinsic + class AdvSIMD_Pred3VectorArg_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_Compare_Intrinsic + class AdvSIMD_SVE_Compare_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_CompareWide_Intrinsic + class AdvSIMD_SVE_CompareWide_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyvector_ty, llvm_nxv2i64_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_Saturating_Intrinsic + class AdvSIMD_SVE_Saturating_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_SaturatingWithPattern_Intrinsic + class AdvSIMD_SVE_SaturatingWithPattern_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>, ImmArg>])>; - class AdvSIMD_SVE_Saturating_N_Intrinsic + class AdvSIMD_SVE_Saturating_N_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[T], [T, llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic + class AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[T], [T, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>, ImmArg>]>; + !listconcat(Attrs, 
[IntrNoMem, ImmArg>, ImmArg>])>; - class AdvSIMD_SVE_CNT_Intrinsic + class AdvSIMD_SVE_CNT_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>], [LLVMVectorOfBitcastsToInt<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_ReduceWithInit_Intrinsic + class AdvSIMD_SVE_ReduceWithInit_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMVectorElementType<0>, llvm_anyvector_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_ShiftByImm_Intrinsic + class AdvSIMD_SVE_ShiftByImm_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>])>; - class AdvSIMD_SVE_ShiftWide_Intrinsic + class AdvSIMD_SVE_ShiftWide_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>, llvm_nxv2i64_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_Unpack_Intrinsic + class AdvSIMD_SVE_Unpack_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_SVE_CADD_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -1231,31 +1230,31 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". llvm_i32_ty], [IntrNoMem, ImmArg>]>; - class AdvSIMD_SVE_CMLA_LANE_Intrinsic + class AdvSIMD_SVE_CMLA_LANE_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>, ImmArg>]>; + !listconcat(Attrs, [IntrNoMem, ImmArg>, ImmArg>])>; - class AdvSIMD_SVE_DUP_Intrinsic + class AdvSIMD_SVE_DUP_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMVectorElementType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_DUP_Unpred_Intrinsic + class AdvSIMD_SVE_DUP_Unpred_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMVectorElementType<0>], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; - class AdvSIMD_SVE_DUPQ_Intrinsic + class AdvSIMD_SVE_DUPQ_Intrinsic Attrs = []> : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i64_ty], - [IntrNoMem]>; + !listconcat(Attrs, [IntrNoMem])>; class AdvSIMD_SVE_EXPA_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -1276,21 +1275,21 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
llvm_anyvector_ty], [IntrNoMem]>;

-  class AdvSIMD_SVE_INSR_Intrinsic
+  class AdvSIMD_SVE_INSR_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, LLVMVectorElementType<0>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

   class AdvSIMD_SVE_PTRUE_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [llvm_i32_ty],
                             [IntrNoMem, ImmArg<ArgIndex<0>>]>;

-  class AdvSIMD_SVE_PUNPKHI_Intrinsic
+  class AdvSIMD_SVE_PUNPKHI_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[LLVMOneNthElementsVectorType<0, 2>],
                             [llvm_anyvector_ty],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

   class AdvSIMD_SVE_SCALE_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
@@ -1312,191 +1311,192 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                             LLVMVectorOfBitcastsToInt<0>],
                             [IntrNoMem]>;

-  class AdvSIMD_SVE_CNTB_Intrinsic
+  class AdvSIMD_SVE_CNTB_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_i64_ty],
                             [llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<0>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<0>>])>;

-  class AdvSIMD_SVE_CNTP_Intrinsic
+  class AdvSIMD_SVE_CNTP_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_i64_ty],
                             [llvm_anyvector_ty, LLVMMatchType<0>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class AdvSIMD_SVE_DOT_Intrinsic
+  class AdvSIMD_SVE_DOT_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, LLVMSubdivide4VectorType<0>,
                              LLVMSubdivide4VectorType<0>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class AdvSIMD_SVE_DOT_Indexed_Intrinsic
+  class AdvSIMD_SVE_DOT_Indexed_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, LLVMSubdivide4VectorType<0>,
                              LLVMSubdivide4VectorType<0>, llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<3>>])>;

-  class AdvSIMD_SVE_PTEST_Intrinsic
+  class AdvSIMD_SVE_PTEST_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_i1_ty],
                             [llvm_anyvector_ty, LLVMMatchType<0>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class AdvSIMD_SVE_TBL_Intrinsic
+  class AdvSIMD_SVE_TBL_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class AdvSIMD_SVE2_TBX_Intrinsic
+  class AdvSIMD_SVE2_TBX_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, LLVMMatchType<0>,
                              LLVMVectorOfBitcastsToInt<0>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class SVE2_LUTI_Inrinsic
+  class SVE2_LUTI_Inrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<2>>])>;

-  class SVE2_1VectorArg_Long_Intrinsic
+  class SVE2_1VectorArg_Long_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMSubdivide2VectorType<0>, llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<1>>])>;

-  class SVE2_2VectorArg_Long_Intrinsic
+  class SVE2_2VectorArg_Long_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMSubdivide2VectorType<0>,
                              LLVMSubdivide2VectorType<0>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class SVE2_2VectorArgIndexed_Long_Intrinsic
+  class SVE2_2VectorArgIndexed_Long_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMSubdivide2VectorType<0>,
                              LLVMSubdivide2VectorType<0>, llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<2>>])>;

-  class SVE2_2VectorArg_Wide_Intrinsic
+  class SVE2_2VectorArg_Wide_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, LLVMSubdivide2VectorType<0>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class SVE2_2VectorArg_Pred_Long_Intrinsic
+  class SVE2_2VectorArg_Pred_Long_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                              LLVMMatchType<0>, LLVMSubdivide2VectorType<0>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class SVE2_3VectorArg_Long_Intrinsic
+  class SVE2_3VectorArg_Long_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, LLVMSubdivide2VectorType<0>,
                              LLVMSubdivide2VectorType<0>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class SVE2_3VectorArgIndexed_Long_Intrinsic
+  class SVE2_3VectorArgIndexed_Long_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, LLVMSubdivide2VectorType<0>,
                              LLVMSubdivide2VectorType<0>, llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<3>>])>;

-  class SVE2_1VectorArg_Narrowing_Intrinsic
+  class SVE2_1VectorArg_Narrowing_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
                             [llvm_anyvector_ty],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class SVE2_Merged1VectorArg_Narrowing_Intrinsic
+  class SVE2_Merged1VectorArg_Narrowing_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
                             [LLVMSubdivide2VectorType<0>, llvm_anyvector_ty],
-                            [IntrNoMem]>;
-  class SVE2_2VectorArg_Narrowing_Intrinsic
+                            !listconcat(Attrs, [IntrNoMem])>;
+
+  class SVE2_2VectorArg_Narrowing_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<
           [LLVMSubdivide2VectorType<0>],
          [llvm_anyvector_ty, LLVMMatchType<0>],
-          [IntrNoMem]>;
+          !listconcat(Attrs, [IntrNoMem])>;

-  class SVE2_Merged2VectorArg_Narrowing_Intrinsic
+  class SVE2_Merged2VectorArg_Narrowing_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<
           [LLVMSubdivide2VectorType<0>],
           [LLVMSubdivide2VectorType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
-          [IntrNoMem]>;
+          !listconcat(Attrs, [IntrNoMem])>;

-  class SVE2_1VectorArg_Imm_Narrowing_Intrinsic
+  class SVE2_1VectorArg_Imm_Narrowing_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
                             [llvm_anyvector_ty, llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<1>>])>;

-  class SVE2_2VectorArg_Imm_Narrowing_Intrinsic
+  class SVE2_2VectorArg_Imm_Narrowing_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
                             [LLVMSubdivide2VectorType<0>, llvm_anyvector_ty,
                              llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<2>>])>;

-  class SVE2_CONFLICT_DETECT_Intrinsic
+  class SVE2_CONFLICT_DETECT_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [llvm_anyptr_ty, LLVMMatchType<1>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class SVE2_3VectorArg_Indexed_Intrinsic
+  class SVE2_3VectorArg_Indexed_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, LLVMSubdivide2VectorType<0>,
                              LLVMSubdivide2VectorType<0>, llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<3>>])>;

-  class SVE2_1VectorArgIndexed_Intrinsic
+  class SVE2_1VectorArgIndexed_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<1>>])>;

-  class AdvSIMD_SVE_CDOT_LANE_Intrinsic
+  class AdvSIMD_SVE_CDOT_LANE_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>, LLVMSubdivide4VectorType<0>,
                              LLVMSubdivide4VectorType<0>, llvm_i32_ty,
                              llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>])>;

-  class SVE2_1VectorArg_Pred_Intrinsic
+  class SVE2_1VectorArg_Pred_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                             [llvm_anyvector_ty],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

-  class SVE2_1VectorArgIndexed_Pred_Intrinsic
+  class SVE2_1VectorArgIndexed_Pred_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                             [llvm_anyvector_ty, llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<1>>])>;

-  class SVE2_Pred_1VectorArgIndexed_Intrinsic
+  class SVE2_Pred_1VectorArgIndexed_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMMatchType<0>,
                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                              llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+                            !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<2>>])>;

-  class SVE2_Pred_1VectorArg_Intrinsic
+  class SVE2_Pred_1VectorArg_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
-                            [IntrNoMem]>;
+                            !listconcat(Attrs, [IntrNoMem])>;

   // NOTE: There is no relationship between these intrinsics beyond an attempt
   // to reuse currently identical class definitions.
-  class AdvSIMD_SVE_LOGB_Intrinsic  : AdvSIMD_SVE_CNT_Intrinsic;
-  class AdvSIMD_SVE2_CADD_Intrinsic : AdvSIMD_2VectorArgIndexed_Intrinsic;
-  class AdvSIMD_SVE2_CMLA_Intrinsic : AdvSIMD_3VectorArgIndexed_Intrinsic;
+  class AdvSIMD_SVE_LOGB_Intrinsic  : AdvSIMD_SVE_CNT_Intrinsic;
+  class AdvSIMD_SVE2_CADD_Intrinsic<list<IntrinsicProperty> Attrs = []> : AdvSIMD_2VectorArgIndexed_Intrinsic<Attrs>;
+  class AdvSIMD_SVE2_CMLA_Intrinsic<list<IntrinsicProperty> Attrs = []> : AdvSIMD_3VectorArgIndexed_Intrinsic<Attrs>;

   // This class of intrinsics are not intended to be useful within LLVM IR but
   // are instead here to support some of the more regid parts of the ACLE.

@@ -1509,39 +1509,39 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".

 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".

-class AdvSIMD_SVE_2SVBoolArg_Intrinsic
+class AdvSIMD_SVE_2SVBoolArg_Intrinsic<list<IntrinsicProperty> Attrs = []>
   : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty],
                           [llvm_nxv16i1_ty],
-                          [IntrNoMem]>;
+                          !listconcat(Attrs, [IntrNoMem])>;

-class AdvSIMD_SVE_3SVBoolArg_Intrinsic
+class AdvSIMD_SVE_3SVBoolArg_Intrinsic<list<IntrinsicProperty> Attrs = []>
   : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty],
                           [llvm_nxv16i1_ty, llvm_nxv16i1_ty],
-                          [IntrNoMem]>;
+                          !listconcat(Attrs, [IntrNoMem])>;

-class AdvSIMD_SVE_Reduce_Intrinsic
+class AdvSIMD_SVE_Reduce_Intrinsic<list<IntrinsicProperty> Attrs = []>
  : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
                          [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                           llvm_anyvector_ty],
-                         [IntrNoMem]>;
+                         !listconcat(Attrs, [IntrNoMem])>;

-class AdvSIMD_SVE_V128_Reduce_Intrinsic
+class AdvSIMD_SVE_V128_Reduce_Intrinsic<list<IntrinsicProperty> Attrs = []>
  : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                          [LLVMScalarOrSameVectorWidth<1, llvm_i1_ty>,
                           llvm_anyvector_ty],
-                         [IntrNoMem]>;
+                         !listconcat(Attrs, [IntrNoMem])>;

-class AdvSIMD_SVE_SADDV_Reduce_Intrinsic
+class AdvSIMD_SVE_SADDV_Reduce_Intrinsic<list<IntrinsicProperty> Attrs = []>
  : DefaultAttrsIntrinsic<[llvm_i64_ty],
                          [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                           llvm_anyvector_ty],
-                         [IntrNoMem]>;
+                         !listconcat(Attrs, [IntrNoMem])>;

-class AdvSIMD_SVE_WHILE_Intrinsic
+class AdvSIMD_SVE_WHILE_Intrinsic<list<IntrinsicProperty> Attrs = []>
  : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                          [llvm_anyint_ty, LLVMMatchType<1>],
-                         [IntrNoMem]>;
+                         !listconcat(Attrs, [IntrNoMem])>;

 class AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic
   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
@@ -1684,10 +1684,10 @@ class SVE_gather_prf_VS
                           ],
                           [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<3>>]>;

-class SVE_MatMul_Intrinsic
+class SVE_MatMul_Intrinsic<list<IntrinsicProperty> Attrs = []>
   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                           [LLVMMatchType<0>, LLVMSubdivide4VectorType<0>,
                            LLVMSubdivide4VectorType<0>],
-                          [IntrNoMem]>;
+                          !listconcat(Attrs, [IntrNoMem])>;

 class SVE_4Vec_BF16
   : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
@@ -1765,159 +1765,158 @@ def int_aarch64_sve_prfd_gather_scalar_offset : SVE_gather_prf_VS;

 //
 // Scalar to vector operations
 //

-def int_aarch64_sve_dup : AdvSIMD_SVE_DUP_Intrinsic;
-def int_aarch64_sve_dup_x : AdvSIMD_SVE_DUP_Unpred_Intrinsic;
+def int_aarch64_sve_dup : AdvSIMD_SVE_DUP_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_dup_x : AdvSIMD_SVE_DUP_Unpred_Intrinsic<[IntrSpeculatable]>;

-def int_aarch64_sve_index : AdvSIMD_SVE_Index_Intrinsic;
+def int_aarch64_sve_index : AdvSIMD_SVE_Index_Intrinsic<[IntrSpeculatable]>;

 //
 // Address calculation
 //

-def int_aarch64_sve_adrb : AdvSIMD_2VectorArg_Intrinsic;
-def int_aarch64_sve_adrh : AdvSIMD_2VectorArg_Intrinsic;
-def int_aarch64_sve_adrw : AdvSIMD_2VectorArg_Intrinsic;
-def int_aarch64_sve_adrd : AdvSIMD_2VectorArg_Intrinsic;
+def int_aarch64_sve_adrb : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_adrh : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_adrw : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_adrd : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>;

 //
 // Integer arithmetic
 //
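(Editorial aside, illustrative only; the following is not part of the patch.) Every class hunk above applies the same mechanical change: the class gains an optional list<IntrinsicProperty> Attrs = [] template parameter, and !listconcat prepends the caller-supplied properties to the properties the class always sets. A minimal TableGen sketch of the pattern, using a hypothetical class name:

    // Hypothetical class; only the Attrs/!listconcat plumbing mirrors the patch.
    class ExampleSVEBinaryIntrinsic<list<IntrinsicProperty> Attrs = []>
        : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                [LLVMMatchType<0>, LLVMMatchType<0>],
                                !listconcat(Attrs, [IntrNoMem])>;

    def int_example_plain : ExampleSVEBinaryIntrinsic;                      // properties: [IntrNoMem]
    def int_example_spec  : ExampleSVEBinaryIntrinsic<[IntrSpeculatable]>;  // properties: [IntrSpeculatable, IntrNoMem]

Because Attrs defaults to the empty list, every existing def keeps its old attribute set; the defs below then opt in to IntrSpeculatable one at a time.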
-def int_aarch64_sve_add : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_add_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_sub : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_sub_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_subr : AdvSIMD_Pred2VectorArg_Intrinsic;
-
-def int_aarch64_sve_pmul : AdvSIMD_2VectorArg_Intrinsic;
-
-def int_aarch64_sve_mul : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_mul_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_mul_lane : AdvSIMD_2VectorArgIndexed_Intrinsic;
-def int_aarch64_sve_smulh : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_smulh_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_umulh : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_umulh_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-
-def int_aarch64_sve_sdiv : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_sdiv_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_udiv : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_udiv_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_sdivr : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_udivr : AdvSIMD_Pred2VectorArg_Intrinsic;
-
-def int_aarch64_sve_smax : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_smax_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_umax : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_umax_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_smin : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_smin_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_umin : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_umin_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_sabd : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_sabd_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_uabd : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_uabd_u : AdvSIMD_Pred2VectorArg_Intrinsic;
-
-def int_aarch64_sve_mad : AdvSIMD_Pred3VectorArg_Intrinsic;
-def int_aarch64_sve_msb : AdvSIMD_Pred3VectorArg_Intrinsic;
-def int_aarch64_sve_mla : AdvSIMD_Pred3VectorArg_Intrinsic;
-def int_aarch64_sve_mla_u : AdvSIMD_Pred3VectorArg_Intrinsic;
-def int_aarch64_sve_mla_lane : AdvSIMD_3VectorArgIndexed_Intrinsic;
-def int_aarch64_sve_mls : AdvSIMD_Pred3VectorArg_Intrinsic;
-def int_aarch64_sve_mls_u : AdvSIMD_Pred3VectorArg_Intrinsic;
-def int_aarch64_sve_mls_lane : AdvSIMD_3VectorArgIndexed_Intrinsic;
-
-def int_aarch64_sve_saddv : AdvSIMD_SVE_SADDV_Reduce_Intrinsic;
-def int_aarch64_sve_uaddv : AdvSIMD_SVE_SADDV_Reduce_Intrinsic;
-
-def int_aarch64_sve_smaxv : AdvSIMD_SVE_Reduce_Intrinsic;
-def int_aarch64_sve_umaxv : AdvSIMD_SVE_Reduce_Intrinsic;
-def int_aarch64_sve_sminv : AdvSIMD_SVE_Reduce_Intrinsic;
-def int_aarch64_sve_uminv : AdvSIMD_SVE_Reduce_Intrinsic;
-
-def int_aarch64_sve_orv : AdvSIMD_SVE_Reduce_Intrinsic;
-def int_aarch64_sve_eorv : AdvSIMD_SVE_Reduce_Intrinsic;
-def int_aarch64_sve_andv : AdvSIMD_SVE_Reduce_Intrinsic;
-
-def int_aarch64_sve_abs : AdvSIMD_Merged1VectorArg_Intrinsic;
-def int_aarch64_sve_neg : AdvSIMD_Merged1VectorArg_Intrinsic;
-
-def int_aarch64_sve_sdot : AdvSIMD_SVE_DOT_Intrinsic;
-def int_aarch64_sve_sdot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
-
-def int_aarch64_sve_udot : AdvSIMD_SVE_DOT_Intrinsic;
-def int_aarch64_sve_udot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
-
-def int_aarch64_sve_sqadd_x : AdvSIMD_2VectorArg_Intrinsic;
-def int_aarch64_sve_sqsub_x : AdvSIMD_2VectorArg_Intrinsic;
-def int_aarch64_sve_uqadd_x : AdvSIMD_2VectorArg_Intrinsic;
-def int_aarch64_sve_uqsub_x : AdvSIMD_2VectorArg_Intrinsic;
-
-def int_aarch64_sve_orqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-def int_aarch64_sve_eorqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-def int_aarch64_sve_andqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-def int_aarch64_sve_addqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-def int_aarch64_sve_smaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-def int_aarch64_sve_umaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-def int_aarch64_sve_sminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-def int_aarch64_sve_uminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-
+def int_aarch64_sve_add : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_add_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sub : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sub_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_subr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+
+def int_aarch64_sve_pmul : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>;
+
+def int_aarch64_sve_mul : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_mul_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_mul_lane : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_smulh : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_smulh_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_umulh : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_umulh_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+
+def int_aarch64_sve_sdiv : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sdiv_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_udiv : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_udiv_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sdivr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_udivr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+
+def int_aarch64_sve_smax : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_smax_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_umax : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_umax_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_smin : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_smin_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_umin : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_umin_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sabd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sabd_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uabd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uabd_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+
+def int_aarch64_sve_mad : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_msb : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_mla : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_mla_u : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_mla_lane : AdvSIMD_3VectorArgIndexed_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_mls : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_mls_u : AdvSIMD_Pred3VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_mls_lane : AdvSIMD_3VectorArgIndexed_Intrinsic<[IntrSpeculatable]>;
+
+def int_aarch64_sve_saddv : AdvSIMD_SVE_SADDV_Reduce_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uaddv : AdvSIMD_SVE_SADDV_Reduce_Intrinsic<[IntrSpeculatable]>;
+
+def int_aarch64_sve_smaxv :
AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umaxv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sminv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uminv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_orv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_eorv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_andv : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_abs : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_neg : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_sdot : AdvSIMD_SVE_DOT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sdot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_udot : AdvSIMD_SVE_DOT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_udot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_sqadd_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqsub_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqadd_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqsub_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_orqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_eorqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_andqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_addqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic<[IntrSpeculatable]>; // Shifts -def int_aarch64_sve_asr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_asr_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_asr_wide : AdvSIMD_SVE_ShiftWide_Intrinsic; -def int_aarch64_sve_asrd : AdvSIMD_SVE_ShiftByImm_Intrinsic; -def int_aarch64_sve_insr : AdvSIMD_SVE_INSR_Intrinsic; -def int_aarch64_sve_lsl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_lsl_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_lsl_wide : AdvSIMD_SVE_ShiftWide_Intrinsic; -def int_aarch64_sve_lsr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_lsr_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_lsr_wide : AdvSIMD_SVE_ShiftWide_Intrinsic; +def int_aarch64_sve_asr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_asr_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_asr_wide : AdvSIMD_SVE_ShiftWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_asrd : AdvSIMD_SVE_ShiftByImm_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_insr : AdvSIMD_SVE_INSR_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lsl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lsl_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lsl_wide : AdvSIMD_SVE_ShiftWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lsr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lsr_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def 
int_aarch64_sve_lsr_wide : AdvSIMD_SVE_ShiftWide_Intrinsic<[IntrSpeculatable]>; // // Integer comparisons // -def int_aarch64_sve_cmpeq : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_cmpge : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_cmpgt : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_cmphi : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_cmphs : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_cmpne : AdvSIMD_SVE_Compare_Intrinsic; +def int_aarch64_sve_cmpeq : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpge : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpgt : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmphi : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmphs : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpne : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_cmpeq_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmpge_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmpgt_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmphi_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmphs_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmple_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmplo_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmpls_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmplt_wide : AdvSIMD_SVE_CompareWide_Intrinsic; -def int_aarch64_sve_cmpne_wide : AdvSIMD_SVE_CompareWide_Intrinsic; +def int_aarch64_sve_cmpeq_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpge_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpgt_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmphi_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmphs_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmple_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmplo_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpls_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmplt_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cmpne_wide : AdvSIMD_SVE_CompareWide_Intrinsic<[IntrSpeculatable]>; // // Counting bits // -def int_aarch64_sve_cls : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_clz : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_cnt : AdvSIMD_SVE_CNT_Intrinsic; +def int_aarch64_sve_cls : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_clz : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cnt : AdvSIMD_SVE_CNT_Intrinsic<[IntrSpeculatable]>; // // Counting elements // -def int_aarch64_sve_cntb : AdvSIMD_SVE_CNTB_Intrinsic; -def int_aarch64_sve_cnth : AdvSIMD_SVE_CNTB_Intrinsic; -def int_aarch64_sve_cntw : AdvSIMD_SVE_CNTB_Intrinsic; -def int_aarch64_sve_cntd : AdvSIMD_SVE_CNTB_Intrinsic; +def int_aarch64_sve_cntb : AdvSIMD_SVE_CNTB_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cnth : AdvSIMD_SVE_CNTB_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cntw : AdvSIMD_SVE_CNTB_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_cntd : AdvSIMD_SVE_CNTB_Intrinsic<[IntrSpeculatable]>; -def int_aarch64_sve_cntp : 
AdvSIMD_SVE_CNTP_Intrinsic; +def int_aarch64_sve_cntp : AdvSIMD_SVE_CNTP_Intrinsic<[IntrSpeculatable]>; // // FFR manipulation @@ -1932,173 +1931,173 @@ def int_aarch64_sve_wrffr : ClangBuiltin<"__builtin_sve_svwrffr">, DefaultAt // Saturating scalar arithmetic // -def int_aarch64_sve_sqdech : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqdecw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqdecd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqdecp : AdvSIMD_SVE_Saturating_Intrinsic; - -def int_aarch64_sve_sqdecb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdech_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdech_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqdecp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; -def int_aarch64_sve_sqdecp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; - -def int_aarch64_sve_sqinch : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqincw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqincd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_sqincp : AdvSIMD_SVE_Saturating_Intrinsic; - -def int_aarch64_sve_sqincb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqinch_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqinch_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_sqincp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; -def int_aarch64_sve_sqincp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; - -def int_aarch64_sve_uqdech : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqdecw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqdecd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqdecp : AdvSIMD_SVE_Saturating_Intrinsic; - -def int_aarch64_sve_uqdecb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdech_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdech_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqdecp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; -def int_aarch64_sve_uqdecp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; - -def int_aarch64_sve_uqinch : 
AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqincw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqincd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic; -def int_aarch64_sve_uqincp : AdvSIMD_SVE_Saturating_Intrinsic; - -def int_aarch64_sve_uqincb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqinch_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqinch_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; -def int_aarch64_sve_uqincp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; -def int_aarch64_sve_uqincp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; +def int_aarch64_sve_sqdech : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdecw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdecd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdecp : AdvSIMD_SVE_Saturating_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_sqdecb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdech_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdech_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqdecp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; +def int_aarch64_sve_sqdecp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; + +def int_aarch64_sve_sqinch : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqincw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqincd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqincp : AdvSIMD_SVE_Saturating_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_sqincb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqinch_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqinch_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_sqincp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; +def int_aarch64_sve_sqincp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; + +def int_aarch64_sve_uqdech : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqdecw : 
AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqdecd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqdecp : AdvSIMD_SVE_Saturating_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_uqdecb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdech_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdech_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqdecp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; +def int_aarch64_sve_uqdecp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; + +def int_aarch64_sve_uqinch : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqincw : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqincd : AdvSIMD_SVE_SaturatingWithPattern_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqincp : AdvSIMD_SVE_Saturating_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_uqincb_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincb_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqinch_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqinch_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincw_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincw_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincd_n32 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincd_n64 : AdvSIMD_SVE_SaturatingWithPattern_N_Intrinsic; +def int_aarch64_sve_uqincp_n32 : AdvSIMD_SVE_Saturating_N_Intrinsic; +def int_aarch64_sve_uqincp_n64 : AdvSIMD_SVE_Saturating_N_Intrinsic; // // Reversal // -def int_aarch64_sve_rbit : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_revb : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_revh : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_revw : AdvSIMD_Merged1VectorArg_Intrinsic; +def int_aarch64_sve_rbit : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_revb : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_revh : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_revw : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; // // Permutations and selection // -def int_aarch64_sve_clasta : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_clasta_n : AdvSIMD_SVE_ReduceWithInit_Intrinsic; -def int_aarch64_sve_clastb : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_clastb_n : AdvSIMD_SVE_ReduceWithInit_Intrinsic; -def int_aarch64_sve_compact : AdvSIMD_Pred1VectorArg_Intrinsic; -def int_aarch64_sve_dupq_lane : AdvSIMD_SVE_DUPQ_Intrinsic; -def int_aarch64_sve_dup_laneq : SVE2_1VectorArgIndexed_Intrinsic; -def int_aarch64_sve_ext : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sel : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_lasta : AdvSIMD_SVE_Reduce_Intrinsic; -def int_aarch64_sve_lastb : AdvSIMD_SVE_Reduce_Intrinsic; -def 
int_aarch64_sve_rev : AdvSIMD_1VectorArg_Intrinsic; -def int_aarch64_sve_rev_b16 : AdvSIMD_SVE_2SVBoolArg_Intrinsic; -def int_aarch64_sve_rev_b32 : AdvSIMD_SVE_2SVBoolArg_Intrinsic; -def int_aarch64_sve_rev_b64 : AdvSIMD_SVE_2SVBoolArg_Intrinsic; -def int_aarch64_sve_splice : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sunpkhi : AdvSIMD_SVE_Unpack_Intrinsic; -def int_aarch64_sve_sunpklo : AdvSIMD_SVE_Unpack_Intrinsic; -def int_aarch64_sve_tbl : AdvSIMD_SVE_TBL_Intrinsic; -def int_aarch64_sve_trn1 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_trn1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn2 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_trn2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_trn1q : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_trn2q : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uunpkhi : AdvSIMD_SVE_Unpack_Intrinsic; -def int_aarch64_sve_uunpklo : AdvSIMD_SVE_Unpack_Intrinsic; -def int_aarch64_sve_uzp1 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uzp1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp2 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uzp2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_uzp1q : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uzp2q : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_zip1 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_zip1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip2 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_zip2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic; -def int_aarch64_sve_zip1q : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_zip2q : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_clasta : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_clasta_n : AdvSIMD_SVE_ReduceWithInit_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_clastb : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_clastb_n : AdvSIMD_SVE_ReduceWithInit_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_compact : AdvSIMD_Pred1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_dupq_lane : AdvSIMD_SVE_DUPQ_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_dup_laneq : SVE2_1VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ext : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sel : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lasta : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_lastb : AdvSIMD_SVE_Reduce_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_rev : AdvSIMD_1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_rev_b16 : 
AdvSIMD_SVE_2SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_rev_b32 : AdvSIMD_SVE_2SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_rev_b64 : AdvSIMD_SVE_2SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_splice : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sunpkhi : AdvSIMD_SVE_Unpack_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sunpklo : AdvSIMD_SVE_Unpack_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_tbl : AdvSIMD_SVE_TBL_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn1 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn2 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn1q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_trn2q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uunpkhi : AdvSIMD_SVE_Unpack_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uunpklo : AdvSIMD_SVE_Unpack_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp1 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp2 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp1q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzp2q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip1 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip1_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip1_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip1_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip2 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip2_b16 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip2_b32 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip2_b64 : AdvSIMD_SVE_3SVBoolArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip1q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zip2q : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; // // Logical operations // -def int_aarch64_sve_and : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_and_u: AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_bic : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_bic_u: AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_cnot : AdvSIMD_Merged1VectorArg_Intrinsic; -def 
int_aarch64_sve_eor : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_eor_u: AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_not : AdvSIMD_Merged1VectorArg_Intrinsic;
-def int_aarch64_sve_orr : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_orr_u: AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_and : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_and_u: AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_bic : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_bic_u: AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_cnot : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_eor : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_eor_u: AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_not : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_orr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_orr_u: AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;

 //
 // Conversion
 //

-def int_aarch64_sve_sxtb : AdvSIMD_Merged1VectorArg_Intrinsic;
-def int_aarch64_sve_sxth : AdvSIMD_Merged1VectorArg_Intrinsic;
-def int_aarch64_sve_sxtw : AdvSIMD_Merged1VectorArg_Intrinsic;
-def int_aarch64_sve_uxtb : AdvSIMD_Merged1VectorArg_Intrinsic;
-def int_aarch64_sve_uxth : AdvSIMD_Merged1VectorArg_Intrinsic;
-def int_aarch64_sve_uxtw : AdvSIMD_Merged1VectorArg_Intrinsic;
+def int_aarch64_sve_sxtb : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sxth : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sxtw : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uxtb : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uxth : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uxtw : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>;

 //
 // While comparisons
 //

-def int_aarch64_sve_whilele : AdvSIMD_SVE_WHILE_Intrinsic;
-def int_aarch64_sve_whilelo : AdvSIMD_SVE_WHILE_Intrinsic;
-def int_aarch64_sve_whilels : AdvSIMD_SVE_WHILE_Intrinsic;
-def int_aarch64_sve_whilelt : AdvSIMD_SVE_WHILE_Intrinsic;
-def int_aarch64_sve_whilege : AdvSIMD_SVE_WHILE_Intrinsic;
-def int_aarch64_sve_whilegt : AdvSIMD_SVE_WHILE_Intrinsic;
-def int_aarch64_sve_whilehs : AdvSIMD_SVE_WHILE_Intrinsic;
-def int_aarch64_sve_whilehi : AdvSIMD_SVE_WHILE_Intrinsic;
+def int_aarch64_sve_whilele : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_whilelo : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_whilels : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_whilelt : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_whilege : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_whilegt : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_whilehs : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_whilehi : AdvSIMD_SVE_WHILE_Intrinsic<[IntrSpeculatable]>;

 //
 // Floating-point arithmetic
@@ -2254,32 +2253,32 @@ def int_aarch64_sve_ptrue : AdvSIMD_SVE_PTRUE_Intrinsic;
 // Predicate operations
 //

-def int_aarch64_sve_and_z : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_bic_z : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_brka : AdvSIMD_Merged1VectorArg_Intrinsic;
-def int_aarch64_sve_brka_z : AdvSIMD_Pred1VectorArg_Intrinsic;
-def int_aarch64_sve_brkb : AdvSIMD_Merged1VectorArg_Intrinsic;
-def int_aarch64_sve_brkb_z : AdvSIMD_Pred1VectorArg_Intrinsic;
-def int_aarch64_sve_brkn_z : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_brkpa_z : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_brkpb_z : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_eor_z : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_nand_z : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_nor_z : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_orn_z : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_orr_z : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_pfirst : AdvSIMD_Pred1VectorArg_Intrinsic;
-def int_aarch64_sve_pnext : AdvSIMD_Pred1VectorArg_Intrinsic;
-def int_aarch64_sve_punpkhi : AdvSIMD_SVE_PUNPKHI_Intrinsic;
-def int_aarch64_sve_punpklo : AdvSIMD_SVE_PUNPKHI_Intrinsic;
+def int_aarch64_sve_and_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_bic_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_brka : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_brka_z : AdvSIMD_Pred1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_brkb : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_brkb_z : AdvSIMD_Pred1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_brkn_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_brkpa_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_brkpb_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_eor_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_nand_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_nor_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_orn_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_orr_z : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_pfirst : AdvSIMD_Pred1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_pnext : AdvSIMD_Pred1VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_punpkhi : AdvSIMD_SVE_PUNPKHI_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_punpklo : AdvSIMD_SVE_PUNPKHI_Intrinsic<[IntrSpeculatable]>;

 //
 // Testing predicates
 //

-def int_aarch64_sve_ptest_any : AdvSIMD_SVE_PTEST_Intrinsic;
-def int_aarch64_sve_ptest_first : AdvSIMD_SVE_PTEST_Intrinsic;
-def int_aarch64_sve_ptest_last : AdvSIMD_SVE_PTEST_Intrinsic;
+def int_aarch64_sve_ptest_any : AdvSIMD_SVE_PTEST_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_ptest_first : AdvSIMD_SVE_PTEST_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_ptest_last : AdvSIMD_SVE_PTEST_Intrinsic<[IntrSpeculatable]>;

 //
 // Reinterpreting data
@@ -2287,11 +2286,11 @@ def int_aarch64_sve_ptest_last : AdvSIMD_SVE_PTEST_Intrinsic;

 def int_aarch64_sve_convert_from_svbool : DefaultAttrsIntrinsic<[llvm_any_ty],
                                                                 [llvm_nxv16i1_ty],
-                                                                [IntrNoMem]>;
+                                                                [IntrNoMem, IntrSpeculatable]>;

 def int_aarch64_sve_convert_to_svbool : DefaultAttrsIntrinsic<[llvm_nxv16i1_ty],
                                                               [llvm_any_ty],
-                                                              [IntrNoMem]>;
+                                                              [IntrNoMem, IntrSpeculatable]>;

 //
 // Gather loads: scalar base + vector offsets
@@ -2434,134 +2433,134 @@ def int_aarch64_sve_stnt1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intri
 // SVE2 - Uniform DSP operations
 //

-def int_aarch64_sve_saba : AdvSIMD_3VectorArg_Intrinsic;
-def
int_aarch64_sve_shadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_shsub : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_shsubr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sli : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sqabs : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_sqadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sqdmulh : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_sqdmulh_lane : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sqneg : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_sqrdmlah : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_sqrdmlah_lane : AdvSIMD_3VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sqrdmlsh : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_sqrdmlsh_lane : AdvSIMD_3VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sqrdmulh : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_sqrdmulh_lane : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_sqrshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sqshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sqshlu : AdvSIMD_SVE_ShiftByImm_Intrinsic; -def int_aarch64_sve_sqsub : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sqsub_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sqsubr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_srhadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_sri : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_srshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_srshr : AdvSIMD_SVE_ShiftByImm_Intrinsic; -def int_aarch64_sve_srsra : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_ssra : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_suqadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uaba : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_uhadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uhsub : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uhsubr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqrshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqsub : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqsub_u : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_uqsubr : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_urecpe : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_urhadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_urshl : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_urshr : AdvSIMD_SVE_ShiftByImm_Intrinsic; -def int_aarch64_sve_ursqrte : AdvSIMD_Merged1VectorArg_Intrinsic; -def int_aarch64_sve_ursra : AdvSIMD_2VectorArgIndexed_Intrinsic; -def int_aarch64_sve_usqadd : AdvSIMD_Pred2VectorArg_Intrinsic; -def int_aarch64_sve_usra : AdvSIMD_2VectorArgIndexed_Intrinsic; +def int_aarch64_sve_saba : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_shadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_shsub : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_shsubr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sli : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqabs : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def 
int_aarch64_sve_sqdmulh : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmulh_lane : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqneg : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmlah : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmlah_lane : AdvSIMD_3VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmlsh : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmlsh_lane : AdvSIMD_3VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmulh : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrdmulh_lane : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqrshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqshlu : AdvSIMD_SVE_ShiftByImm_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqsub : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqsub_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqsubr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_srhadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sri : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_srshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_srshr : AdvSIMD_SVE_ShiftByImm_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_srsra : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssra : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_suqadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uaba : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uhadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uhsub : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uhsubr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqrshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqsub : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqsub_u : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uqsubr : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_urecpe : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_urhadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_urshl : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_urshr : AdvSIMD_SVE_ShiftByImm_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ursqrte : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ursra : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_usqadd : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_usra : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Widening DSP operations // -def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic; -def 
int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_saddwb : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_saddwt : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_sshllb : SVE2_1VectorArg_Long_Intrinsic; -def int_aarch64_sve_sshllt : SVE2_1VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssubwb : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_ssubwt : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uaddwb : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_uaddwt : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_ushllb : SVE2_1VectorArg_Long_Intrinsic; -def int_aarch64_sve_ushllt : SVE2_1VectorArg_Long_Intrinsic; -def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_usubwb : SVE2_2VectorArg_Wide_Intrinsic; -def int_aarch64_sve_usubwt : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_saddwb : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_saddwt : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sshllb : SVE2_1VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sshllt : SVE2_1VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssubwb : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ssubwt : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uaddwb : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uaddwt : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ushllb : SVE2_1VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_ushllt : 
SVE2_1VectorArg_Long_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_usubwb : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_usubwt : SVE2_2VectorArg_Wide_Intrinsic<[IntrSpeculatable]>;

 //
 // SVE2 - Non-widening pairwise arithmetic
 //

-def int_aarch64_sve_addp : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_addp : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_faddp : AdvSIMD_Pred2VectorArg_Intrinsic;
 def int_aarch64_sve_fmaxp : AdvSIMD_Pred2VectorArg_Intrinsic;
 def int_aarch64_sve_fmaxnmp : AdvSIMD_Pred2VectorArg_Intrinsic;
 def int_aarch64_sve_fminp : AdvSIMD_Pred2VectorArg_Intrinsic;
 def int_aarch64_sve_fminnmp : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_smaxp : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_sminp : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_umaxp : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_uminp : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_smaxp : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sminp : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_umaxp : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uminp : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;

 //
 // SVE2 - Widening pairwise arithmetic
 //

-def int_aarch64_sve_sadalp : SVE2_2VectorArg_Pred_Long_Intrinsic;
-def int_aarch64_sve_uadalp : SVE2_2VectorArg_Pred_Long_Intrinsic;
+def int_aarch64_sve_sadalp : SVE2_2VectorArg_Pred_Long_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uadalp : SVE2_2VectorArg_Pred_Long_Intrinsic<[IntrSpeculatable]>;

 //
 // SVE2 - Uniform complex integer arithmetic
 //

-def int_aarch64_sve_cadd_x : AdvSIMD_SVE2_CADD_Intrinsic;
-def int_aarch64_sve_sqcadd_x : AdvSIMD_SVE2_CADD_Intrinsic;
-def int_aarch64_sve_cmla_x : AdvSIMD_SVE2_CMLA_Intrinsic;
-def int_aarch64_sve_cmla_lane_x : AdvSIMD_SVE_CMLA_LANE_Intrinsic;
-def int_aarch64_sve_sqrdcmlah_x : AdvSIMD_SVE2_CMLA_Intrinsic;
-def int_aarch64_sve_sqrdcmlah_lane_x : AdvSIMD_SVE_CMLA_LANE_Intrinsic;
+def int_aarch64_sve_cadd_x : AdvSIMD_SVE2_CADD_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sqcadd_x : AdvSIMD_SVE2_CADD_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_cmla_x : AdvSIMD_SVE2_CMLA_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_cmla_lane_x : AdvSIMD_SVE_CMLA_LANE_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sqrdcmlah_x : AdvSIMD_SVE2_CMLA_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sqrdcmlah_lane_x : AdvSIMD_SVE_CMLA_LANE_Intrinsic<[IntrSpeculatable]>;

 //
 // SVE2 - Widening complex integer arithmetic
 //

-def int_aarch64_sve_saddlbt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_ssublbt : SVE2_2VectorArg_Long_Intrinsic;
-def int_aarch64_sve_ssubltb : SVE2_2VectorArg_Long_Intrinsic;
+def int_aarch64_sve_saddlbt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_ssublbt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_ssubltb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>;

 //
 // SVE2 - Widening complex integer dot product
 //

-def int_aarch64_sve_cdot : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
-def int_aarch64_sve_cdot_lane : AdvSIMD_SVE_CDOT_LANE_Intrinsic;
+def int_aarch64_sve_cdot : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_cdot_lane : AdvSIMD_SVE_CDOT_LANE_Intrinsic<[IntrSpeculatable]>;

 //
 // SVE2 - Floating-point widening multiply-accumulate
@@ -2586,137 +2585,137 @@ def int_aarch64_sve_flogb : AdvSIMD_SVE_LOGB_Intrinsic;
 // SVE2 - Vector histogram count
 //

-def int_aarch64_sve_histcnt : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_histseg : AdvSIMD_2VectorArg_Intrinsic;
+def int_aarch64_sve_histcnt : AdvSIMD_Pred2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_histseg : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>;

 //
 // SVE2 - Character match
 //

-def int_aarch64_sve_match : AdvSIMD_SVE_Compare_Intrinsic;
-def int_aarch64_sve_nmatch : AdvSIMD_SVE_Compare_Intrinsic;
+def int_aarch64_sve_match : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_nmatch : AdvSIMD_SVE_Compare_Intrinsic<[IntrSpeculatable]>;

 //
 // SVE2 - Unary narrowing operations
 //

-def int_aarch64_sve_sqxtnb : SVE2_1VectorArg_Narrowing_Intrinsic;
-def int_aarch64_sve_sqxtnt : SVE2_Merged1VectorArg_Narrowing_Intrinsic;
-def int_aarch64_sve_sqxtunb : SVE2_1VectorArg_Narrowing_Intrinsic;
-def int_aarch64_sve_sqxtunt : SVE2_Merged1VectorArg_Narrowing_Intrinsic;
-def int_aarch64_sve_uqxtnb : SVE2_1VectorArg_Narrowing_Intrinsic;
-def int_aarch64_sve_uqxtnt : SVE2_Merged1VectorArg_Narrowing_Intrinsic;
+def int_aarch64_sve_sqxtnb : SVE2_1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sqxtnt : SVE2_Merged1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sqxtunb : SVE2_1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sqxtunt : SVE2_Merged1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uqxtnb : SVE2_1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uqxtnt : SVE2_Merged1VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;

 //
 // SVE2 - Binary narrowing DSP operations
 //
-def int_aarch64_sve_addhnb : SVE2_2VectorArg_Narrowing_Intrinsic;
-def int_aarch64_sve_addhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic;
+def int_aarch64_sve_addhnb : SVE2_2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_addhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;

-def int_aarch64_sve_raddhnb : SVE2_2VectorArg_Narrowing_Intrinsic;
-def int_aarch64_sve_raddhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic;
+def int_aarch64_sve_raddhnb : SVE2_2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_raddhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;

-def int_aarch64_sve_subhnb : SVE2_2VectorArg_Narrowing_Intrinsic;
-def int_aarch64_sve_subhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic;
+def int_aarch64_sve_subhnb : SVE2_2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_subhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;

-def int_aarch64_sve_rsubhnb : SVE2_2VectorArg_Narrowing_Intrinsic;
-def int_aarch64_sve_rsubhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic;
+def int_aarch64_sve_rsubhnb : SVE2_2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_rsubhnt : SVE2_Merged2VectorArg_Narrowing_Intrinsic<[IntrSpeculatable]>;

 // Narrowing shift right
-def int_aarch64_sve_shrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
-def int_aarch64_sve_shrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_shrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_shrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;

-def int_aarch64_sve_rshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
-def int_aarch64_sve_rshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_rshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_rshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;

 // Saturating shift right - signed input/output
-def int_aarch64_sve_sqshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
-def int_aarch64_sve_sqshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_sqshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sqshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;

-def int_aarch64_sve_sqrshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
-def int_aarch64_sve_sqrshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_sqrshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sqrshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;

 // Saturating shift right - unsigned input/output
-def int_aarch64_sve_uqshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
-def int_aarch64_sve_uqshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_uqshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uqshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;

-def int_aarch64_sve_uqrshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
-def int_aarch64_sve_uqrshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_uqrshrnb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uqrshrnt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;

 // Saturating shift right - signed input, unsigned output
-def int_aarch64_sve_sqshrunb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
-def int_aarch64_sve_sqshrunt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_sqshrunb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sqshrunt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;

-def int_aarch64_sve_sqrshrunb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
-def int_aarch64_sve_sqrshrunt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_sqrshrunb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sqrshrunt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic<[IntrSpeculatable]>;

 // SVE2 MLA LANE.
-def int_aarch64_sve_smlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; -def int_aarch64_sve_smullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; -def int_aarch64_sve_umullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; -def int_aarch64_sve_umullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; -def int_aarch64_sve_sqdmlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; -def int_aarch64_sve_sqdmullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_smlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; // SVE2 MLA Unpredicated. 
-def int_aarch64_sve_smlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlslt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlslt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic; - -def int_aarch64_sve_sqdmlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlslt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlalbt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlslbt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_smlalb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlalt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlalb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlalt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlslb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smlslt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlslb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umlslt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; + +def int_aarch64_sve_sqdmlalb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlalt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlslb : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlslt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlalbt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sqdmlslbt : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; // SVE2 ADDSUB Long Unpredicated. 
-def int_aarch64_sve_adclb : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_adclt : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_sbclb : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_sbclt : AdvSIMD_3VectorArg_Intrinsic; +def int_aarch64_sve_adclb : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_adclt : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sbclb : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_sbclt : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Polynomial arithmetic // -def int_aarch64_sve_eorbt : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_eortb : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_pmullb_pair : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_pmullt_pair : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_eorbt : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_eortb : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_pmullb_pair : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_pmullt_pair : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; // // SVE2 bitwise ternary operations. // -def int_aarch64_sve_eor3 : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_bcax : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_bsl : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_bsl1n : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_bsl2n : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_nbsl : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_xar : AdvSIMD_2VectorArgIndexed_Intrinsic; +def int_aarch64_sve_eor3 : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bcax : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bsl : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bsl1n : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_bsl2n : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_nbsl : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_xar : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>; // // SVE2 - Optional AES, SHA-3 and SM4 @@ -2725,70 +2724,70 @@ def int_aarch64_sve_xar : AdvSIMD_2VectorArgIndexed_Intrinsic; def int_aarch64_sve_aesd : ClangBuiltin<"__builtin_sve_svaesd_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty, llvm_nxv16i8_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_aesimc : ClangBuiltin<"__builtin_sve_svaesimc_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_aese : ClangBuiltin<"__builtin_sve_svaese_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty, llvm_nxv16i8_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_aesmc : ClangBuiltin<"__builtin_sve_svaesmc_u8">, DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_nxv16i8_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_rax1 : ClangBuiltin<"__builtin_sve_svrax1_u64">, DefaultAttrsIntrinsic<[llvm_nxv2i64_ty], [llvm_nxv2i64_ty, llvm_nxv2i64_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_sm4e : ClangBuiltin<"__builtin_sve_svsm4e_u32">, DefaultAttrsIntrinsic<[llvm_nxv4i32_ty], [llvm_nxv4i32_ty, llvm_nxv4i32_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_sm4ekey : ClangBuiltin<"__builtin_sve_svsm4ekey_u32">, 
                          DefaultAttrsIntrinsic<[llvm_nxv4i32_ty],
                                                [llvm_nxv4i32_ty, llvm_nxv4i32_ty],
-                                               [IntrNoMem]>;
+                                               [IntrNoMem, IntrSpeculatable]>;
 
 //
 // SVE2 - Extended table lookup/permute
 //
 
-def int_aarch64_sve_tbl2 : AdvSIMD_SVE2_TBX_Intrinsic;
-def int_aarch64_sve_tbx : AdvSIMD_SVE2_TBX_Intrinsic;
+def int_aarch64_sve_tbl2 : AdvSIMD_SVE2_TBX_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_tbx : AdvSIMD_SVE2_TBX_Intrinsic<[IntrSpeculatable]>;
 
 //
 // SVE2 - Lookup Table
 //
 
-def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic;
-def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic;
+def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                     [LLVMMatchType<0>, LLVMMatchType<0>,
                                      llvm_nxv16i8_ty, llvm_i32_ty],
-                                    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+                                    [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
 
 //
 // SVE2 - Optional bit permutation
 //
 
-def int_aarch64_sve_bdep_x : AdvSIMD_2VectorArg_Intrinsic;
-def int_aarch64_sve_bext_x : AdvSIMD_2VectorArg_Intrinsic;
-def int_aarch64_sve_bgrp_x : AdvSIMD_2VectorArg_Intrinsic;
+def int_aarch64_sve_bdep_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_bext_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_bgrp_x : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>;
 
 //
 // SVE ACLE: 7.3. INT8 matrix multiply extensions
 //
 
-def int_aarch64_sve_ummla : SVE_MatMul_Intrinsic;
-def int_aarch64_sve_smmla : SVE_MatMul_Intrinsic;
-def int_aarch64_sve_usmmla : SVE_MatMul_Intrinsic;
+def int_aarch64_sve_ummla : SVE_MatMul_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_smmla : SVE_MatMul_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_usmmla : SVE_MatMul_Intrinsic<[IntrSpeculatable]>;
 
-def int_aarch64_sve_usdot : AdvSIMD_SVE_DOT_Intrinsic;
-def int_aarch64_sve_usdot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
-def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
+def int_aarch64_sve_usdot : AdvSIMD_SVE_DOT_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_usdot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic<[IntrSpeculatable]>;
 
 //
 // SVE ACLE: 7.4/5.
FP64/FP32 matrix multiply extensions @@ -2885,14 +2884,14 @@ def int_aarch64_sve_stnt1_pn_x4 : SVE2p1_Store_PN_X4_Intrinsic; // SVE2 - Contiguous conflict detection // -def int_aarch64_sve_whilerw_b : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilerw_h : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilerw_s : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilerw_d : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilewr_b : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilewr_h : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilewr_s : SVE2_CONFLICT_DETECT_Intrinsic; -def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic; +def int_aarch64_sve_whilerw_b : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilerw_h : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilerw_s : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilerw_d : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilewr_b : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilewr_h : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilewr_s : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic<[IntrSpeculatable]>; // Scalable Matrix Extension (SME) Intrinsics let TargetPrefix = "aarch64" in { @@ -3127,8 +3126,8 @@ let TargetPrefix = "aarch64" in { // Clamp // - def int_aarch64_sve_sclamp : AdvSIMD_3VectorArg_Intrinsic; - def int_aarch64_sve_uclamp : AdvSIMD_3VectorArg_Intrinsic; + def int_aarch64_sve_sclamp : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; + def int_aarch64_sve_uclamp : AdvSIMD_3VectorArg_Intrinsic<[IntrSpeculatable]>; def int_aarch64_sve_fclamp : AdvSIMD_3VectorArg_Intrinsic; @@ -3136,7 +3135,7 @@ let TargetPrefix = "aarch64" in { // Reversal // - def int_aarch64_sve_revd : AdvSIMD_Merged1VectorArg_Intrinsic; + def int_aarch64_sve_revd : AdvSIMD_Merged1VectorArg_Intrinsic<[IntrSpeculatable]>; // // Predicate selection @@ -3837,11 +3836,11 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_uzpq_x4 : SVE2_VG4_ZipUzp_Intrinsic; // Vector dot-products (2-way) - def int_aarch64_sve_sdot_x2 : SVE2_3VectorArg_Long_Intrinsic; - def int_aarch64_sve_udot_x2 : SVE2_3VectorArg_Long_Intrinsic; + def int_aarch64_sve_sdot_x2 : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; + def int_aarch64_sve_udot_x2 : SVE2_3VectorArg_Long_Intrinsic<[IntrSpeculatable]>; def int_aarch64_sve_fdot_x2 : SVE2_3VectorArg_Long_Intrinsic; - def int_aarch64_sve_sdot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic; - def int_aarch64_sve_udot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic; + def int_aarch64_sve_sdot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; + def int_aarch64_sve_udot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic<[IntrSpeculatable]>; def int_aarch64_sve_fdot_lane_x2 : SVE2_3VectorArgIndexed_Long_Intrinsic; // @@ -3932,30 +3931,30 @@ let TargetPrefix = "aarch64" in { // SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 // -def int_aarch64_sve_zipq1 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_zipq2 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uzpq1 : AdvSIMD_2VectorArg_Intrinsic; -def int_aarch64_sve_uzpq2 : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_zipq1 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_zipq2 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_uzpq1 : 
AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_uzpq2 : AdvSIMD_2VectorArg_Intrinsic<[IntrSpeculatable]>;
 
 // SVE2.1 - Programmable table lookup within each quadword vector segment
 // (zeroing)/(merging)
 //
-def int_aarch64_sve_tblq : AdvSIMD_SVE_TBL_Intrinsic;
-def int_aarch64_sve_tbxq : AdvSIMD_SVE2_TBX_Intrinsic;
+def int_aarch64_sve_tblq : AdvSIMD_SVE_TBL_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_tbxq : AdvSIMD_SVE2_TBX_Intrinsic<[IntrSpeculatable]>;
 
 // SVE2.1 - Extract vector segment from each pair of quadword segments.
 //
-def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic;
+def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic<[IntrSpeculatable]>;
 
 //
 // SVE2.1 - Move predicate to/from vector
 //
-def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic;
+def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic<[IntrSpeculatable]>;
 
-def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
+def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic<[IntrSpeculatable]>;
 
-def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic;
-def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
+def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic<[IntrSpeculatable]>;
 
 def int_aarch64_sme_mopa_nonwide : SME_OuterProduct_Intrinsic;
 def int_aarch64_sme_mops_nonwide : SME_OuterProduct_Intrinsic;
diff --git a/llvm/test/Assembler/aarch64-intrinsics-attributes.ll b/llvm/test/Assembler/aarch64-intrinsics-attributes.ll
index 33f2758a4b18c..42691bbb01bc8 100644
--- a/llvm/test/Assembler/aarch64-intrinsics-attributes.ll
+++ b/llvm/test/Assembler/aarch64-intrinsics-attributes.ll
@@ -19,7 +19,7 @@ declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
 ; CHECK: declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_READNONE_WILLRETURN]]
 declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>)
 
-; CHECK: declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32) [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_READNONE_WILLRETURN]]
+; CHECK: declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32) [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_SPECULATABLE_READNONE_WILLRETURN:#[0-9]+]]
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
 
 ; CHECK: declare void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32>, <4 x i32>, ptr captures(none)) [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_WRITEONLY_WILLRETURN:#[0-9]+]]
@@ -33,4 +33,5 @@ declare void @llvm.aarch64.neon.st1x2.v8i16.p0(<8 x i16>, <8 x i16>, ptr)
 
 ; CHECK: attributes [[NOFREE_NOUNWIND_WILLRETURN]] = { nofree nounwind willreturn }
 ; CHECK: attributes [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_READNONE_WILLRETURN]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_SPECULATABLE_READNONE_WILLRETURN]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ; CHECK: attributes [[NO_CALLBACK_NOFREE_NOSYNC_NOUNWIND_WRITEONLY_WILLRETURN]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/Transforms/LICM/AArch64/speculative-intrinsic-hoisting.ll b/llvm/test/Transforms/LICM/AArch64/speculative-intrinsic-hoisting.ll
new file mode 100644
index 0000000000000..72463b07521eb
--- /dev/null
+++ b/llvm/test/Transforms/LICM/AArch64/speculative-intrinsic-hoisting.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=licm < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i64 @sve_uaddv(<vscale x 4 x i32> %inv, i1 %c) {
+; CHECK-LABEL: define i64 @sve_uaddv(
+; CHECK-SAME: <vscale x 4 x i32> [[INV:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[UADDV:%.*]] = call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[INV]])
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[BACKEDGE_COND:%.*]] = icmp ult i64 [[IV]], [[UADDV]]
+; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C]], i1 [[BACKEDGE_COND]], i1 false
+; CHECK-NEXT: br i1 [[OR_COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT: ret i64 [[IV_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %cond.true ]
+  %iv.next = add i64 %iv, 1
+  br i1 %c, label %cond.true, label %exit
+
+cond.true:
+  %uaddv = call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %inv)
+  %backedge.cond = icmp ult i64 %iv, %uaddv
+  br i1 %backedge.cond, label %loop, label %exit
+
+exit:
+  ret i64 %iv
+}
+
+define i64 @sve_faddv(<vscale x 4 x float> %inv, i1 %c) {
+; CHECK-LABEL: define i64 @sve_faddv(
+; CHECK-SAME: <vscale x 4 x float> [[INV:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[COND_TRUE:.*]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: br i1 [[C]], label %[[COND_TRUE]], label %[[EXIT:.*]]
+; CHECK: [[COND_TRUE]]:
+; CHECK-NEXT: [[FADDV:%.*]] = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[INV]])
+; CHECK-NEXT: [[IV_AS_FLOAT:%.*]] = sitofp i64 [[IV]] to float
+; CHECK-NEXT: [[BACKEDGE_COND:%.*]] = fcmp olt float [[IV_AS_FLOAT]], [[FADDV]]
+; CHECK-NEXT: br i1 [[BACKEDGE_COND]], label %[[LOOP]], label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[COND_TRUE]] ], [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT: ret i64 [[IV_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %cond.true ]
+  %iv.next = add i64 %iv, 1
+  br i1 %c, label %cond.true, label %exit
+
+cond.true:
+  %faddv = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %inv)
+  %iv.as.float = sitofp i64 %iv to float
+  %backedge.cond = fcmp olt float %iv.as.float, %faddv
+  br i1 %backedge.cond, label %loop, label %exit
+
+exit:
+  ret i64 %iv
+}

From 76dac58c9a77d9fb78a33c832f80d40f236ecd66 Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Tue, 18 Nov 2025 12:56:42 +0100
Subject: [PATCH 34/35] [MLIR][NVVM] Move the docs to markdown file (#168375)

---
 mlir/docs/Dialects/NVVM/_index.md           | 84 +++++++++++++++++++++
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 78 -------------------
 2 files changed, 84 insertions(+), 78 deletions(-)
 create mode 100644 mlir/docs/Dialects/NVVM/_index.md

diff --git a/mlir/docs/Dialects/NVVM/_index.md b/mlir/docs/Dialects/NVVM/_index.md
new file mode 100644
index 0000000000000..f4832f76f86ad
--- /dev/null
+++ b/mlir/docs/Dialects/NVVM/_index.md
@@ -0,0 +1,84 @@
+# NVVM Dialect
+
+The NVVM dialect is MLIR's LLVM-IR-based, NVIDIA-specific backend dialect. It
+models NVVM intrinsics and public ISA functionality and introduces NVIDIA
+extensions to the MLIR/LLVM type system and address spaces (e.g., global,
+shared, and cluster memory), enabling faithful lowering of GPU kernels to the
+NVPTX toolchain. While an NVVM op usually maps to a single LLVM IR intrinsic,
+the NVVM dialect uses type polymorphism and other attributes so that a single
+NVVM op can map to different LLVM intrinsics.
+
+## Scope and Capabilities
+
+The dialect covers core GPU features such as thread/block builtins, barriers
+and atomics, warp-level collectives (e.g., shuffle/vote), matrix/tensor core
+operations (e.g., `mma.sync`, `wgmma`), tensor memory accelerator (TMA)
+operations, asynchronous copies (`cp.async`, bulk/tensor variants) with memory
+barriers, cache and prefetch controls, and NVVM-specific attributes and enums
+(e.g., FP rounding modes, memory scopes, and MMA types/layouts).
+
+## Placement in the Lowering Pipeline
+
+NVVM sits below target-agnostic dialects like `gpu` and NVIDIA's `nvgpu`.
+Typical pipelines convert `gpu`/`nvgpu` ops into NVVM using
+`-convert-gpu-to-nvvm` and `-convert-nvgpu-to-nvvm`, then translate into LLVM
+for final code generation via the NVPTX backend.
+
+## Target Configuration and Serialization
+
+NVVM provides a `#nvvm.target` attribute to describe the GPU target (SM,
+features, and flags). In conjunction with `gpu` serialization (e.g.,
+`gpu-module-to-binary`), this enables producing architecture-specific GPU
+binaries (such as CUBIN) from nested GPU modules.
+
+## Inline PTX
+
+When an intrinsic is unavailable or a performance-critical sequence must be
+expressed directly, NVVM provides an `nvvm.inline_ptx` op to embed PTX inline
+as a last-resort escape hatch, with explicit operands and results.
+
+## Memory Spaces
+
+The NVVM dialect introduces the following memory spaces, each with distinct
+scopes and lifetimes:
+
+| Memory Space      | Address Space | Scope                |
+|-------------------|---------------|----------------------|
+| `generic`         | 0             | All threads          |
+| `global`          | 1             | All threads (device) |
+| `shared`          | 3             | Thread block (CTA)   |
+| `constant`        | 4             | All threads          |
+| `local`           | 5             | Single thread        |
+| `tensor`          | 6             | Thread block (CTA)   |
+| `shared_cluster`  | 7             | Thread block cluster |
+
+### Memory Space Details
+
+- **generic**: Can point to any memory space; requires runtime resolution of
+  actual address space. Use when pointer origin is unknown at compile time.
+  Performance varies based on the underlying memory space.
+- **global**: Accessible by all threads across all blocks; persists across
+  kernel launches. Highest latency but largest capacity (device memory). Best
+  for large data and inter-kernel communication.
+- **shared**: Shared within a thread block (CTA); very fast on-chip memory for
+  cooperation between threads in the same block. Limited capacity. Ideal for
+  block-level collaboration, caching, and reducing global memory traffic.
+- **constant**: Read-only memory cached per SM. Size typically limited to 64KB.
+  Best for read-only data and uniform values accessed by all threads.
+- **local**: Private to each thread. Use for per-thread private data and
+  automatic variables that don't fit in registers.
+- **tensor**: Special memory space for tensor core operations. Used by
+  `tcgen05` instructions on SM 100+ for tensor input/output operations.
+- **shared_cluster**: Distributed shared memory across thread blocks within a
+  cluster (SM 90+). Enables collaboration beyond single-block scope with fast
+  access across cluster threads. A pointer-type sketch follows below.
+
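+As a quick illustration (a minimal sketch, not normative: the function below
+is hypothetical, but the `!llvm.ptr<N>` address-space numbers follow the
+table above), these spaces surface directly in LLVM-dialect pointer types:
+
+```mlir
+// Hypothetical helper: copies one f32 from global memory (address space 1)
+// into CTA-shared memory (address space 3).
+llvm.func @copy_to_shared(%src: !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+  %v = llvm.load %src : !llvm.ptr<1> -> f32
+  llvm.store %v, %dst : f32, !llvm.ptr<3>
+  llvm.return
+}
+```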
+
+## Non-Goals
+
+NVVM is not a place for convenience or "wrapper" ops. It is not intended to
+introduce high-level ops that expand into multiple unrelated NVVM intrinsics or
+that lower to no intrinsic at all. Such abstractions belong in higher-level
+dialects (e.g., `nvgpu`, `gpu`, or project-specific dialects). The design
+intent is a thin, predictable, low-level surface with near-mechanical lowering
+to NVVM/LLVM IR.
\ No newline at end of file
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 87c73c4587485..524b9f820f290 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -37,84 +37,6 @@ def LLVM_PointerSharedCluster : LLVM_PointerInAddressSpace<7>;
 //===----------------------------------------------------------------------===//
 
 def NVVM_Dialect : Dialect {
-  let summary = "The NVVM dialect that models NVIDIA's public ISA";
-
-  let description = [{
-    The NVVM dialect is MLIR's LLVM-IR-based, NVIDIA-specific backend dialect. It
-    models NVVM intrinsics and public ISA functionality and introduces NVIDIA
-    extensions to the MLIR/LLVM type system and address spaces (e.g., global,
-    shared, and cluster memory), enabling faithful lowering of GPU kernels to the
-    NVPTX toolchain. While a NVVM op usually maps to a single LLVM IR intrinsic,
-    the NVVM dialect uses type polymorphism and other attributes so that a single
-    NVVM op can map to different LLVM intrinsics.
-
-    **Scope and capabilities:** The dialect covers core GPU features such as
-    thread/block builtins, barriers and atomics, warp-level collectives (e.g.,
-    shuffle/vote), matrix/tensor core operations (e.g., `mma.sync`, `wgmma`),
-    tensor memory accelerator (TMA) operations, asynchronous copies (`cp.async`,
-    bulk/tensor variants) with memory barriers, cache and prefetch controls, and
-    NVVM-specific attributes and enums (e.g., FP rounding modes, memory scopes,
-    and MMA types/layouts).
-
-    **Non-goals:** NVVM is not a place for convenience or “wrapper” ops. It is
-    not intended to introduce high-level ops that expand into multiple unrelated
-    NVVM intrinsics or that lower to no intrinsic at all. Such abstractions belong
-    in higher-level dialects (e.g., `nvgpu`, `gpu`, or project-specific dialects).
-    The design intent is a thin, predictable, low-level surface with
-    near-mechanical lowering to NVVM/LLVM IR.
-
-    **Placement in the lowering pipeline:** NVVM sits below target-agnostic
-    dialects like `gpu` and NVIDIA's `nvgpu`. Typical pipelines convert
-    `gpu`/`nvgpu` ops into NVVM using `-convert-gpu-to-nvvm` and
-    `-convert-nvgpu-to-nvvm`, then translate into LLVM for final code
-    generation via NVPTX backend.
-
-    **Target configuration and serialization:** NVVM provides a `#nvvm.target`
-    attribute to describe the GPU target (SM, features, and flags). In
-    conjunction with `gpu` serialization (e.g., `gpu-module-to-binary`), this
-    enables producing architecture-specific GPU binaries (such as CUBIN) from
-    nested GPU modules.
-
-    **Inline PTX:** When an intrinsic is unavailable or a performance-critical
-    sequence must be expressed directly, NVVM provides an `nvvm.inline_ptx` op to
-    embed PTX inline as a last-resort escape hatch, with explicit operands and
-    results.
- - - **Memory Spaces:** The NVVM dialect introduces the following memory spaces, - each with distinct scopes and lifetimes: -``` - | Memory Space | Address Space | Scope | Lifetime | - |-------------------|---------------|----------------------|-------------------| - | `generic` | 0 | All threads | Context-dependent | - | `global` | 1 | All threads (device) | Application | - | `shared` | 3 | Thread block (CTA) | Kernel execution | - | `constant` | 4 | All threads (RO) | Application | - | `local` | 5 | Single thread | Kernel execution | - | `tensor` | 6 | Thread block (CTA) | Kernel execution | - | `shared_cluster` | 7 | Thread block cluster | Kernel execution | -``` - **Memory Space Details:** - - **generic**: Can point to any memory space; requires runtime resolution of - actual address space. Use when pointer origin is unknown at compile time. - Performance varies based on the underlying memory space. - - **global**: Accessible by all threads across all blocks; persists across - kernel launches. Highest latency but largest capacity (device memory). Best - for large data and inter-kernel communication. - - **shared**: Shared within a thread block (CTA); very fast on-chip memory for - cooperation between threads in the same block. Limited capacity. Ideal for - block-level collaboration, caching, and reducing global memory traffic. - - **constant**: Read-only memory cached per SM. Size typically limited to - 64KB. Best for read-only data and uniform values accessed by all threads. - - **local**: Private to each thread. Use for per-thread private data and - automatic variables that don't fit in registers. - - **tensor**: Special memory space for tensor core operations. Used by - `tcgen05` instructions on SM 100+ for tensor input/output operations. - - **shared_cluster**: Distributed shared memory across thread blocks within - a cluster (SM 90+). Enables collaboration beyond single-block scope with - fast access across cluster threads. - }]; - let name = "nvvm"; let cppNamespace = "::mlir::NVVM"; let dependentDialects = ["LLVM::LLVMDialect"]; From 4ecfaa602f56a29ea8acd3fd39cf0cf3958b4dae Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 18 Nov 2025 12:05:02 +0000 Subject: [PATCH 35/35] [AArch64][GlobalISel] Add better basic legalization for llround. (#168427) This adds handling for f16 and f128 lround/llround under LP64 targets, promoting the f16 where needed and using a libcall for f128. This codegen is now identical to the selection dag version. 
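For illustration, the newly handled cases correspond to IR like the following
(a minimal sketch with invented function names; on typical AArch64 LP64
targets the f128 case is assumed to lower to the llroundl libcall):

```llvm
; f16 source: the operand is widened to f32 first when full FP16 is
; unavailable; with +fullfp16 it can be selected directly.
define i64 @llround_f16(half %x) {
  %r = call i64 @llvm.llround.i64.f16(half %x)
  ret i64 %r
}

; f128 source: no native instruction, so this becomes a libcall.
define i64 @llround_f128(fp128 %x) {
  %r = call i64 @llvm.llround.i64.f128(fp128 %x)
  ret i64 %r
}

declare i64 @llvm.llround.i64.f16(half)
declare i64 @llvm.llround.i64.f128(fp128)
```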
--- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 12 ++++++++++++ .../Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 8 +++++--- llvm/test/CodeGen/AArch64/llround-conv-fp16.ll | 8 ++------ llvm/test/CodeGen/AArch64/llround-conv.ll | 5 +---- llvm/test/CodeGen/AArch64/lround-conv-fp16.ll | 8 ++------ llvm/test/CodeGen/AArch64/lround-conv.ll | 5 +---- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index cacb292acee18..ba28e4dda3313 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3439,6 +3439,18 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_LROUND: + case TargetOpcode::G_LLROUND: + Observer.changingInstr(MI); + + if (TypeIdx == 0) + widenScalarDst(MI, WideTy); + else + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); + + Observer.changedInstr(MI); + return Legalized; + case TargetOpcode::G_INTTOPTR: if (TypeIdx != 1) return UnableToLegalize; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index a88817c9d2d19..fdf69b04bf676 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -449,10 +449,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .minScalar(0, s32) .libcallFor({{s32, s32}, {s64, s32}, {s128, s32}}); - // TODO: Libcall support for s128. - // TODO: s16 should be legal with full FP16 support. getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) - .legalFor({{s64, s32}, {s64, s64}}); + .legalFor({{s64, s32}, {s64, s64}}) + .legalFor(HasFP16, {{s64, s16}}) + .minScalar(0, s64) + .minScalar(1, s32) + .libcallFor({{s64, s128}}); // TODO: Custom legalization for mismatched types. 
getActionDefinitionsBuilder(G_FCOPYSIGN) diff --git a/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll b/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll index cb042757a4a42..3a4be1bda7cd6 100644 --- a/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll +++ b/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll @@ -1,12 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16 -; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-NOFP16,CHECK-GI -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for testmhhs -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhws -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhxs +; RUN: llc < %s -mtriple=aarch64 -global-isel | FileCheck %s --check-prefixes=CHECK-NOFP16 +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK-FP16 define i16 @testmhhs(half %x) { ; CHECK-NOFP16-LABEL: testmhhs: diff --git a/llvm/test/CodeGen/AArch64/llround-conv.ll b/llvm/test/CodeGen/AArch64/llround-conv.ll index 4cc089804ce97..bdee73076347a 100644 --- a/llvm/test/CodeGen/AArch64/llround-conv.ll +++ b/llvm/test/CodeGen/AArch64/llround-conv.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s -; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for testmswl -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmsll +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel | FileCheck %s define i32 @testmsws(float %x) { ; CHECK-LABEL: testmsws: diff --git a/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll b/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll index a29dea0eb9f9f..0b18f220067ca 100644 --- a/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll +++ b/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll @@ -1,12 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16 -; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-NOFP16,CHECK-GI -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for testmhhs -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhws -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhxs +; RUN: llc < %s -mtriple=aarch64 -global-isel | FileCheck %s --check-prefixes=CHECK-NOFP16 +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK-FP16 define i16 @testmhhs(half %x) { ; CHECK-NOFP16-LABEL: testmhhs: diff --git 
a/llvm/test/CodeGen/AArch64/lround-conv.ll b/llvm/test/CodeGen/AArch64/lround-conv.ll index 0bf82b538e70c..4b1782457cc10 100644 --- a/llvm/test/CodeGen/AArch64/lround-conv.ll +++ b/llvm/test/CodeGen/AArch64/lround-conv.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s -; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for testmswl -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmsll +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel | FileCheck %s define i32 @testmsws(float %x) { ; CHECK-LABEL: testmsws: